rley 0.6.01 → 0.6.02
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -1
- data/examples/NLP/engtagger.rb +58 -60
- data/lib/rley/constants.rb +1 -1
- metadata +2 -33
- data/examples/general/SRL/lib/ast_builder.rb +0 -382
- data/examples/general/SRL/lib/grammar.rb +0 -106
- data/examples/general/SRL/lib/regex/abstract_method.rb +0 -35
- data/examples/general/SRL/lib/regex/alternation.rb +0 -27
- data/examples/general/SRL/lib/regex/anchor.rb +0 -45
- data/examples/general/SRL/lib/regex/atomic_expression.rb +0 -16
- data/examples/general/SRL/lib/regex/capturing_group.rb +0 -51
- data/examples/general/SRL/lib/regex/char_class.rb +0 -38
- data/examples/general/SRL/lib/regex/char_range.rb +0 -51
- data/examples/general/SRL/lib/regex/char_shorthand.rb +0 -50
- data/examples/general/SRL/lib/regex/character.rb +0 -204
- data/examples/general/SRL/lib/regex/compound_expression.rb +0 -57
- data/examples/general/SRL/lib/regex/concatenation.rb +0 -29
- data/examples/general/SRL/lib/regex/expression.rb +0 -60
- data/examples/general/SRL/lib/regex/lookaround.rb +0 -50
- data/examples/general/SRL/lib/regex/match_option.rb +0 -34
- data/examples/general/SRL/lib/regex/monadic_expression.rb +0 -28
- data/examples/general/SRL/lib/regex/multiplicity.rb +0 -91
- data/examples/general/SRL/lib/regex/non_capturing_group.rb +0 -27
- data/examples/general/SRL/lib/regex/polyadic_expression.rb +0 -60
- data/examples/general/SRL/lib/regex/quantifiable.rb +0 -22
- data/examples/general/SRL/lib/regex/repetition.rb +0 -29
- data/examples/general/SRL/lib/regex/wildcard.rb +0 -23
- data/examples/general/SRL/lib/regex_repr.rb +0 -13
- data/examples/general/SRL/lib/tokenizer.rb +0 -147
- data/examples/general/SRL/spec/integration_spec.rb +0 -448
- data/examples/general/SRL/spec/regex/character_spec.rb +0 -166
- data/examples/general/SRL/spec/regex/multiplicity_spec.rb +0 -79
- data/examples/general/SRL/spec/spec_helper.rb +0 -25
- data/examples/general/SRL/spec/tokenizer_spec.rb +0 -148
- data/examples/general/SRL/srl_demo.rb +0 -75
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9764b3e89d5395584b93ca2a06d80a3381d58560
+  data.tar.gz: 91eb805167c77da1d86cf23e04382d9c791f47c5
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9dc181140698cab328974cbbca977ccbd0de1afc5eb0af628f2c7517fbb1ad93af058341e6146542540adcd7ea7268dd3a234131b3b046838c6a26e2cf05146e
+  data.tar.gz: 90e957d8e37e957b50c769de6661c3ce5f5ad9d6c48beb5c7a8e7487622568273f715c5bacdbbd7ed382f410c6cd9b420218ac447e92fc279fd901b6e0a2baea
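To double-check these published digests against a local copy of the gem, something like the following Ruby sketch works. The file paths are hypothetical: they assume the two entries have already been extracted from rley-0.6.02.gem (a plain tar archive) into the current directory.

require 'digest'

# SHA512 digests published in checksums.yaml above (rley 0.6.02).
EXPECTED = {
  'metadata.gz' => '9dc181140698cab328974cbbca977ccbd0de1afc5eb0af628f2c7517fbb1ad93af058341e6146542540adcd7ea7268dd3a234131b3b046838c6a26e2cf05146e',
  'data.tar.gz' => '90e957d8e37e957b50c769de6661c3ce5f5ad9d6c48beb5c7a8e7487622568273f715c5bacdbbd7ed382f410c6cd9b420218ac447e92fc279fd901b6e0a2baea'
}.freeze

# Compare each published digest with the digest of the local file.
EXPECTED.each do |file, digest|
  actual = Digest::SHA512.file(file).hexdigest
  puts format('%-11s %s', file, actual == digest ? 'OK' : 'MISMATCH')
end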
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,8 @@
-### 0.6.
+### 0.6.02 / 2018-03-03
+* [FIX] File `examples/general/NLP/engtagger.rb` code re-styling to remove most style offenses found by Rubocop 0.52.1
+* [DELETE] All files `examples/general/SRL` are removed. It will become a new gem by itself.
+
+### 0.6.01 / 2018-03-03
 * [FIX] Code re-styling to remove most style offenses found by Rubocop 0.52.1
 
 ### 0.6.00 / 2018-02-25
data/examples/NLP/engtagger.rb
CHANGED
@@ -1,6 +1,5 @@
-require
-require
-require "pp"
+require 'rley'
+require 'engtagger' # Load POS (Part-Of-Speech) tagger EngTagger
 
 # REGEX to remove XML tags from Engtagger output
 GET_TAG = /<(.+?)>(.*?)<.+?>/
@@ -23,7 +22,7 @@ end
 def valid_text(text)
   if !text
     # there's nothing to parse
-
+    puts 'method call on uninitialized variable'
     return false
   elsif /\A\s*\z/ =~ text
     # text is an empty string, nothing to parse
@@ -36,26 +35,26 @@ end
 
 def split_sentences(array)
   tokenized = array
-  people = %w
-  supt det mssrs rev
-  army = %w
-  inst = %w
-  place = %w
-  hwy hway la pde pd plz pl rd st tce
-  comp = %w
-  state = %w
+  people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+              supt det mssrs rev]
+  army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
+  inst = %w[dept univ assn bros ph.d]
+  place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+             hwy hway la pde pd plz pl rd st tce]
+  comp = %w[mfg inc ltd co corp]
+  state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
              ind ia kans kan ken ky la me md is mass mich minn miss mo mont
              neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
-  va wash wis wisc wy wyo usafa alta man ont que sask yuk
-  month = %w
-  misc = %w
+             va wash wis wisc wy wyo usafa alta man ont que sask yuk]
+  month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
+  misc = %w[vs etc no esp]
   abbr = Hash.new
   [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
     abbr[i] = true
   end
   words = Array.new
-  tokenized.each_with_index do |
-  if tokenized[i + 1]
+  tokenized.each_with_index do |_t, i|
+    if tokenized[i + 1] && tokenized[i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
       w = $1
       # Don't separate the period off words that
       # meet any of the following conditions:
@@ -63,7 +62,7 @@ def split_sentences(array)
       # 1. It is defined in one of the lists above
       # 2. It is only one letter long: Alfred E. Sloan
      # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
-      unless abbr[w.downcase]
+      unless abbr[w.downcase] || w =~ /\A[a-z]\z/i || w =~ /[a-z](?:\.[a-z])+\z/i
         words << w
         words << '.'
         next
@@ -72,7 +71,7 @@ def split_sentences(array)
     words << tokenized[i]
   end
   # If the final word ends in a period..
-  if words[-1]
+  if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
     words[-1] = $1
     words.push '.'
   end
@@ -85,39 +84,42 @@ def split_punct(text)
   # If there's no punctuation, return immediately
   return [text] if /\A\w+\z/ =~ text
   # Sanity checks
-  text = text.gsub(/\W{10,}/o,
+  text = text.gsub(/\W{10,}/o, ' ')
 
   # Put quotes into a standard format
-  text = text.gsub(/`(?!`)(?=.*\w)/o,
-  text = text.gsub(/"(?=.*\w)/o,
-  text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 +
+  text = text.gsub(/`(?!`)(?=.*\w)/o, '` ') # Shift left quotes off text
+  text = text.gsub(/"(?=.*\w)/o, ' `` ') # Convert left quotes to ``
+  text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? $1 + ' ` ' : ' ` ' } # Convert left quote to `
   text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
-  text = text.gsub(/(\w)'(?!')(?=\W|$)/o
+  text = text.gsub(/(\w)'(?!')(?=\W|$)/o, "\\1 ' ") # Separate right single quotes
 
   # Handle all other punctuation
-  text = text.gsub(/--+/o,
-  text = text.gsub(/,(?!\d)/o,
-  text = text.gsub(/:/o,
-  text = text.gsub(/(\.\.\.+)/o
-  text = text.gsub(/([\(\[\{\}\]\)])/o
-  text = text.gsub(/([\!\?#\$%;~|])/o
+  text = text.gsub(/--+/o, ' - ') # Convert and separate dashes
+  text = text.gsub(/,(?!\d)/o, ' , ') # Shift comma if not following by digit
+  text = text.gsub(/:/o, ' :') # Shift semicolon off
+  text = text.gsub(/(\.\.\.+)/o, ' \1 ') # Shift ellipses off
+  text = text.gsub(/([\(\[\{\}\]\)])/o, ' \1 ') # Shift off brackets
+  text = text.gsub(/([\!\?#\$%;~|])/o, ' \1 ') # Shift off other ``standard'' punctuation
 
   # English-specific contractions
-  text = text.gsub(/([A-Za-z])'([dms])\b/o
-  text = text.gsub(/n't\b/o, " n't")
-  text = text.gsub(/'(ve|ll|re)\b/o
+  text = text.gsub(/([A-Za-z])'([dms])\b/o, "\\1 '\\2") # Separate off 'd 'm 's
+  text = text.gsub(/n't\b/o, " n't") # Separate off n't
+  text = text.gsub(/'(ve|ll|re)\b/o, " '\\1") # Separate off 've, 'll, 're
   result = text.split(' ')
   return result
 end
 
 
-# Instantiate a
-
+# Instantiate a facade object as our Rley interface
+nlp_engine = Rley::Engine.new
 
-
+# Now build a very simplified English grammar...
+nlp_engine.build_grammar do
+  # Terminals have same names as POS tags returned by Engtagger
+  add_terminals('NN', 'NNP')
   add_terminals('DET', 'IN', 'VBD')
 
-  # Here we define the productions (= grammar rules)
+  # Here we define the productions (= grammar rules)
   rule 'S' => %w[NP VP]
   rule 'NP' => 'NNP'
   rule 'NP' => %w[DET NN]
@@ -125,52 +127,48 @@ builder = Rley::Syntax::GrammarBuilder.new do
   rule 'VP' => %w[VBD NP]
   rule 'VP' => %w[VBD NP PP]
   rule 'PP' => %w[IN NP]
-end
-
-# And now, let's build the grammar...
-grammar = builder.grammar
-
-parser = Rley::Parser::GFGEarleyParser.new(grammar)
+end
 
 # text = "Yo I'm not done with you"
-text=
-
+text = 'John saw Mary with a telescope'
+puts "Input text --> #{text}"
 
 tgr = EngTagger.new
 
-#
+# Generate raw POS output
 tagged = tgr.add_tags(text)
 
 # Generte tokenied lexicon of input text
-# Instead of creating a lexicon dictionary,
+# Instead of creating a lexicon dictionary,
+# we would simply generate one each time on the fly for the current text only.
 lexicon = clean_text(text)
 
-#
+# Convert EngTagger POS tokens in [[word, pos], ..] format
 tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
 
-def tokenizer(lexicon,
+def tokenizer(lexicon, tokens)
   rley_tokens = []
-  lexicon.each_with_index do |word, i|
+  lexicon.each_with_index do |word, i|
     term_name = tokens[i].last
-
-    rley_tokens << Rley::Lexical::Token.new(word, terminal)
+    rley_tokens << Rley::Lexical::Token.new(word, term_name)
  end
   return rley_tokens
 end
 
 # Convert input text into a sequence of rley token objects...
-rley_tokens = tokenizer(lexicon,
+rley_tokens = tokenizer(lexicon, tokens)
 
-
+# Let Rley grok the tokens
+result = nlp_engine.parse(rley_tokens)
 
-
-
+puts "Parsing successful? #{result.success?}" # => Parsing successful? true
+puts result.failure_reason.message unless result.success?
 
-ptree = result
+ptree = nlp_engine.convert(result)
 
-visitor =
+visitor = nlp_engine.ptree_visitor(ptree)
 
 renderer = Rley::Formatter::Asciitree.new($stdout)
 
-#
-
+# Let's visualize the parse tree (in text format...)
+puts renderer.render(visitor)
data/lib/rley/constants.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rley
 version: !ruby/object:Gem::Version
-  version: 0.6.
+  version: 0.6.02
 platform: ruby
 authors:
 - Dimitri Geshef
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-03-
+date: 2018-03-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: coveralls
@@ -142,37 +142,6 @@ files:
 - examples/data_formats/JSON/json_grammar.rb
 - examples/data_formats/JSON/json_lexer.rb
 - examples/data_formats/JSON/json_minifier.rb
-- examples/general/SRL/lib/ast_builder.rb
-- examples/general/SRL/lib/grammar.rb
-- examples/general/SRL/lib/regex/abstract_method.rb
-- examples/general/SRL/lib/regex/alternation.rb
-- examples/general/SRL/lib/regex/anchor.rb
-- examples/general/SRL/lib/regex/atomic_expression.rb
-- examples/general/SRL/lib/regex/capturing_group.rb
-- examples/general/SRL/lib/regex/char_class.rb
-- examples/general/SRL/lib/regex/char_range.rb
-- examples/general/SRL/lib/regex/char_shorthand.rb
-- examples/general/SRL/lib/regex/character.rb
-- examples/general/SRL/lib/regex/compound_expression.rb
-- examples/general/SRL/lib/regex/concatenation.rb
-- examples/general/SRL/lib/regex/expression.rb
-- examples/general/SRL/lib/regex/lookaround.rb
-- examples/general/SRL/lib/regex/match_option.rb
-- examples/general/SRL/lib/regex/monadic_expression.rb
-- examples/general/SRL/lib/regex/multiplicity.rb
-- examples/general/SRL/lib/regex/non_capturing_group.rb
-- examples/general/SRL/lib/regex/polyadic_expression.rb
-- examples/general/SRL/lib/regex/quantifiable.rb
-- examples/general/SRL/lib/regex/repetition.rb
-- examples/general/SRL/lib/regex/wildcard.rb
-- examples/general/SRL/lib/regex_repr.rb
-- examples/general/SRL/lib/tokenizer.rb
-- examples/general/SRL/spec/integration_spec.rb
-- examples/general/SRL/spec/regex/character_spec.rb
-- examples/general/SRL/spec/regex/multiplicity_spec.rb
-- examples/general/SRL/spec/spec_helper.rb
-- examples/general/SRL/spec/tokenizer_spec.rb
-- examples/general/SRL/srl_demo.rb
 - examples/general/calc_iter1/calc_ast_builder.rb
 - examples/general/calc_iter1/calc_ast_nodes.rb
 - examples/general/calc_iter1/calc_demo.rb
data/examples/general/SRL/lib/ast_builder.rb
DELETED
@@ -1,382 +0,0 @@
-require 'stringio'
-require_relative 'regex_repr'
-
-# The purpose of a ASTBuilder is to build piece by piece an AST
-# (Abstract Syntax Tree) from a sequence of input tokens and
-# visit events produced by walking over a GFGParsing object.
-# Uses the Builder GoF pattern.
-# The Builder pattern creates a complex object
-# (say, a parse tree) from simpler objects (terminal and non-terminal
-# nodes) and using a step by step approach.
-class ASTBuilder < Rley::ParseRep::ASTBaseBuilder
-  Terminal2NodeClass = {}.freeze
-
-  attr_reader :options
-
-  protected
-
-  def terminal2node()
-    Terminal2NodeClass
-  end
-
-  # Overriding method.
-  # Factory method for creating a node object for the given
-  # input token.
-  # @param aTerminal [Terminal] Terminal symbol associated with the token
-  # @param aTokenPosition [Integer] Position of token in the input stream
-  # @param aToken [Token] The input token
-  def new_leaf_node(_production, _terminal, aTokenPosition, aToken)
-    node = Rley::PTree::TerminalNode.new(aToken, aTokenPosition)
-
-    return node
-  end
-
-  def multiplicity(lowerBound, upperBound)
-    return SRL::Regex::Multiplicity.new(lowerBound, upperBound, :greedy)
-  end
-
-  def string_literal(aString, to_escape = true)
-    if aString.size > 1
-      chars = []
-      aString.each_char do |ch|
-        if to_escape && Regex::Character::MetaChars.include?(ch)
-          chars << Regex::Character.new("\\")
-        end
-        chars << Regex::Character.new(ch)
-      end
-      result = Regex::Concatenation.new(*chars)
-    elsif to_escape && Regex::Character::MetaChars.include?(aString)
-      backslash = Regex::Character.new("\\")
-      a_string = Regex::Character.new(aString)
-      result = Regex::Concatenation.new(backslash, a_string)
-    else
-      result = Regex::Character.new(aString)
-    end
-
-    return result
-  end
-
-  def char_range(lowerBound, upperBound)
-    # TODO fix module nesting
-    lower = Regex::Character.new(lowerBound)
-    upper = Regex::Character.new(upperBound)
-    return Regex::CharRange.new(lower, upper)
-  end
-
-  def char_class(toNegate, *theChildren)
-    Regex::CharClass.new(toNegate, *theChildren)
-  end
-
-  def char_shorthand(shortName)
-    Regex::CharShorthand.new(shortName)
-  end
-
-  def wildcard()
-    Regex::Wildcard.new
-  end
-
-  def repetition(expressionToRepeat, aMultiplicity)
-    return Regex::Repetition.new(expressionToRepeat, aMultiplicity)
-  end
-
-  def begin_anchor
-    return Regex::Anchor.new('^')
-  end
-
-  # rule('expression' => %w[pattern separator flags]).as 'flagged_expr'
-  def reduce_flagged_expr(_production, aRange, theTokens, theChildren)
-    @options = theChildren[2] if theChildren[2]
-    return_first_child(aRange, theTokens, theChildren)
-  end
-
-  # rule('pattern' => %w[pattern separator quantifiable]).as 'pattern_sequence'
-  def reduce_pattern_sequence(_production, _range, _tokens, theChildren)
-    return Regex::Concatenation.new(theChildren[0], theChildren[2])
-  end
-
-  # rule('flags' => %[flags separator single_flag]).as 'flag_sequence'
-  def reduce_flag_sequence(_production, _range, _tokens, theChildren)
-    theChildren[0] << theChildren[2]
-  end
-
-  # rule('single_flag' => %w[CASE INSENSITIVE]).as 'case_insensitive'
-  def reduce_case_insensitive(_production, _range, _tokens, _children)
-    return [Regex::MatchOption.new(:IGNORECASE, true)]
-  end
-
-  # rule('single_flag' => %w[MULTI LINE]).as 'multi_line'
-  def reduce_multi_line(_production, _range, _tokens, _children)
-    return [Regex::MatchOption.new(:MULTILINE, true)]
-  end
-
-  # rule('single_flag' => %w[ALL LAZY]).as 'all_lazy'
-  def reduce_all_lazy(_production, _range, _tokens, _children)
-    return [Regex::MatchOption.new(:ALL_LAZY, true)]
-  end
-
-  # rule 'quantifiable' => %w[begin_anchor anchorable end_anchor]
-  def reduce_pinned_quantifiable(_production, _range, _tokens, theChildren)
-    theChildren[1].begin_anchor = theChildren[0]
-    theChildren[1].end_anchor = theChildren[2]
-    return theChildren[1]
-  end
-
-  # rule 'quantifiable' => %w[begin_anchor anchorable]
-  def reduce_begin_anchor_quantifiable(_production, _range, _tokens, theChildren)
-    theChildren[1].begin_anchor = theChildren[0]
-    return theChildren[1]
-  end
-
-  # rule 'quantifiable' => %w[anchorable end_anchor]
-  def reduce_end_anchor_quantifiable(_production, _range, _tokens, theChildren)
-    theChildren[0].end_anchor = theChildren[1]
-    return theChildren[0]
-  end
-
-  # rule 'begin_anchor' => %w[STARTS WITH]
-  def reduce_starts_with(_production, _range, _tokens, _children)
-    begin_anchor
-  end
-
-  # rule 'begin_anchor' => %w[BEGIN WITH]
-  def reduce_begin_with(_production, _range, _tokens, _children)
-    begin_anchor
-  end
-
-  # rule 'end_anchor' => %w[MUST END].as 'end_anchor'
-  def reduce_end_anchor(_production, _range, _tokens, _children)
-    return Regex::Anchor.new('$')
-  end
-
-  # rule('anchorable' => %w[assertable assertion]).as 'asserted_anchorable'
-  def reduce_asserted_anchorable(_production, _range, _tokens, theChildren)
-    assertion = theChildren.last
-    assertion.children.unshift(theChildren[0])
-    return assertion
-  end
-
-  # rule('assertion' => %w[IF FOLLOWED BY assertable]).as 'if_followed'
-  def reduce_if_followed(_production, _range, _tokens, theChildren)
-    return Regex::Lookaround.new(theChildren.last, :ahead, :positive)
-  end
-
-  # rule('assertion' => %w[IF NOT FOLLOWED BY assertable]).as 'if_not_followed'
-  def reduce_if_not_followed(_production, _range, _tokens, theChildren)
-    return Regex::Lookaround.new(theChildren.last, :ahead, :negative)
-  end
-
-  # rule('assertion' => %w[IF ALREADY HAD assertable]).as 'if_had'
-  def reduce_if_had(_production, _range, _tokens, theChildren)
-    return Regex::Lookaround.new(theChildren.last, :behind, :positive)
-  end
-
-  # rule('assertion' => %w[IF NOT ALREADY HAD assertable]).as 'if_not_had'
-  def reduce_if_not_had(_production, _range, _tokens, theChildren)
-    return Regex::Lookaround.new(theChildren.last, :behind, :negative)
-  end
-
-  # rule('assertable' => %w[term quantifier]).as 'quantified_assertable'
-  def reduce_quantified_assertable(_production, _range, _tokens, theChildren)
-    quantifier = theChildren[1]
-    term = theChildren[0]
-    repetition(term, quantifier)
-  end
-
-  # rule('letter_range' => %w[LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'lowercase_from_to'
-  def reduce_lowercase_from_to(_production, _range, _tokens, theChildren)
-    lower = theChildren[2].token.lexeme
-    upper = theChildren[4].token.lexeme
-    ch_range = char_range(lower, upper)
-    char_class(false, ch_range)
-  end
-
-  # rule('letter_range' => %w[UPPERCASE LETTER FROM LETTER_LIT TO LETTER_LIT]).as 'uppercase_from_to'
-  def reduce_uppercase_from_to(_production, _range, _tokens, theChildren)
-    lower = theChildren[3].token.lexeme
-    upper = theChildren[5].token.lexeme
-    ch_range = char_range(lower.upcase, upper.upcase)
-    char_class(false, ch_range)
-  end
-
-  # rule('letter_range' => 'LETTER').as 'any_lowercase'
-  def reduce_any_lowercase(_production, _range, _tokens, _children)
-    ch_range = char_range('a', 'z')
-    char_class(false, ch_range)
-  end
-
-  # rule('letter_range' => %w[UPPERCASE LETTER]).as 'any_uppercase'
-  def reduce_any_uppercase(_production, _range, _tokens, _children)
-    ch_range = char_range('A', 'Z')
-    char_class(false, ch_range)
-  end
-
-  # rule('digit_range' => %w[digit_or_number FROM DIGIT_LIT TO DIGIT_LIT]).as 'digits_from_to'
-  def reduce_digits_from_to(aProduction, aRange, theTokens, theChildren)
-    reduce_lowercase_from_to(aProduction, aRange, theTokens, theChildren)
-  end
-
-  # rule('digit_range' => 'digit_or_number').as 'simple_digit_range'
-  def reduce_simple_digit_range(_production, _range, _tokens, _children)
-    char_shorthand('d')
-  end
-
-  # rule('character_class' => %w[ANY CHARACTER]).as 'any_character'
-  def reduce_any_character(_production, _range, _tokens, _children)
-    char_shorthand('w')
-  end
-
-  # rule('character_class' => %w[NO CHARACTER]).as 'no_character'
-  def reduce_no_character(_production, _range, _tokens, _children)
-    char_shorthand('W')
-  end
-
-  # rule('character_class' => 'WHITESPACE').as 'whitespace'
-  def reduce_whitespace(_production, _range, _tokens, _children)
-    char_shorthand('s')
-  end
-
-  # rule('character_class' => %w[NO WHITESPACE]).as 'no_whitespace'
-  def reduce_no_whitespace(_production, _range, _tokens, _children)
-    char_shorthand('S')
-  end
-
-  # rule('character_class' => 'ANYTHING').as 'anything'
-  def reduce_anything(_production, _range, _tokens, _children)
-    wildcard
-  end
-
-  # rule('alternation' => %w[ANY OF LPAREN alternatives RPAREN]).as 'any_of'
-  def reduce_one_of(_production, _range, _tokens, theChildren)
-    raw_literal = theChildren[-1].token.lexeme.dup
-    alternatives = raw_literal.chars.map { |ch| Regex::Character.new(ch) }
-    # TODO check other implementations
-    return Regex::CharClass.new(false, *alternatives)
-  end
-
-  # rule('special_char' => 'TAB').as 'tab'
-  def reduce_tab(_production, _range, _tokens, _children)
-    Regex::Character.new('\t')
-  end
-
-  # rule('special_char' => 'BACKSLASH').as 'backslash'
-  def reduce_backslash(_production, _range, _tokens, _children)
-    Regex::Character.new('\\')
-  end
-
-  # rule('special_char' => %w[NEW LINE]).as 'new_line'
-  def reduce_new_line(_production, _range, _tokens, _children)
-    # TODO: control portability
-    Regex::Character.new('\n')
-  end
-
-  # rule('literal' => %w[LITERALLY STRING_LIT]).as 'literally'
-  def reduce_literally(_production, _range, _tokens, theChildren)
-    # What if literal is empty?...
-
-    raw_literal = theChildren[-1].token.lexeme.dup
-    return string_literal(raw_literal)
-  end
-
-  # rule('alternation' => %w[ANY OF LPAREN alternatives RPAREN]).as 'any_of'
-  def reduce_any_of(_production, _range, _tokens, theChildren)
-    return Regex::Alternation.new(*theChildren[3])
-  end
-
-  # rule('alternatives' => %w[alternatives separator quantifiable]).as 'alternative_list'
-  def reduce_alternative_list(_production, _range, _tokens, theChildren)
-    return theChildren[0] << theChildren[-1]
-  end
-
-  # rule('alternatives' => 'quantifiable').as 'simple_alternative'
-  def reduce_simple_alternative(_production, _range, _tokens, theChildren)
-    return [theChildren.last]
-  end
-
-  # rule('grouping' => %w[LPAREN pattern RPAREN]).as 'grouping_parenthenses'
-  def reduce_grouping_parenthenses(_production, _range, _tokens, theChildren)
-    return Regex::NonCapturingGroup.new(theChildren[1])
-  end
-
-  # rule('capturing_group' => %w[CAPTURE assertable]).as 'capture'
-  def reduce_capture(_production, _range, _tokens, theChildren)
-    return Regex::CapturingGroup.new(theChildren[1])
-  end
-
-  # rule('capturing_group' => %w[CAPTURE assertable UNTIL assertable]).as
-  # 'capture_until'
-  def reduce_capture_until(_production, _range, _tokens, theChildren)
-    group = Regex::CapturingGroup.new(theChildren[1])
-    return Regex::Concatenation.new(group, theChildren[3])
-  end
-
-  # rule('capturing_group' => %w[CAPTURE assertable AS var_name]).as
-  # 'named_capture'
-  def reduce_named_capture(_production, _range, _tokens, theChildren)
-    name = theChildren[3].token.lexeme.dup
-    return Regex::CapturingGroup.new(theChildren[1], name)
-  end
-
-  # rule('capturing_group' => %w[CAPTURE assertable AS var_name
-  # UNTIL assertable]).as 'named_capture_until'
-  def reduce_named_capture_until(_production, _range, _tokens, theChildren)
-    name = theChildren[3].token.lexeme.dup
-    group = Regex::CapturingGroup.new(theChildren[1], name)
-    return Regex::Concatenation.new(group, theChildren[5])
-  end
-
-  # rule('quantifier' => 'ONCE').as 'once'
-  def reduce_once(_production, _range, _tokens, _children)
-    multiplicity(1, 1)
-  end
-
-  # rule('quantifier' => 'TWICE').as 'twice'
-  def reduce_twice(_production, _range, _tokens, _children)
-    multiplicity(2, 2)
-  end
-
-  # rule('quantifier' => %w[EXACTLY count TIMES]).as 'exactly'
-  def reduce_exactly(_production, _range, _tokens, theChildren)
-    count = theChildren[1].token.lexeme.to_i
-    multiplicity(count, count)
-  end
-
-  # rule('quantifier' => %w[BETWEEN count AND count times_suffix]).as
-  # 'between_and'
-  def reduce_between_and(_production, _range, _tokens, theChildren)
-    lower = theChildren[1].token.lexeme.to_i
-    upper = theChildren[3].token.lexeme.to_i
-    multiplicity(lower, upper)
-  end
-
-  # rule('quantifier' => 'OPTIONAL').as 'optional'
-  def reduce_optional(_production, _range, _tokens, _children)
-    multiplicity(0, 1)
-  end
-
-  # rule('quantifier' => %w[ONCE OR MORE]).as 'once_or_more'
-  def reduce_once_or_more(_production, _range, _tokens, _children)
-    multiplicity(1, :more)
-  end
-
-  # rule('quantifier' => %w[NEVER OR MORE]).as 'never_or_more'
-  def reduce_never_or_more(_production, _range, _tokens, _children)
-    multiplicity(0, :more)
-  end
-
-  # rule('quantifier' => %w[AT LEAST count TIMES]).as 'at_least'
-  def reduce_at_least(_production, _range, _tokens, theChildren)
-    count = theChildren[2].token.lexeme.to_i
-    multiplicity(count, :more)
-  end
-
-  # rule('times_suffix' => 'TIMES').as 'times_keyword'
-  def reduce_times_keyword(_production, _range, _tokens, _children)
-    return nil
-  end
-
-  # rule('times_suffix' => []).as 'times_dropped'
-  def reduce_times_dropped(_production, _range, _tokens, _children)
-    return nil
-  end
-end # class
-# End of file