rley 0.7.07 → 0.7.08
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +348 -54
- data/LICENSE.txt +1 -1
- data/README.md +3 -2
- data/examples/NLP/engtagger.rb +193 -190
- data/examples/NLP/nano_eng/nano_grammar.rb +5 -5
- data/examples/data_formats/JSON/cli_options.rb +1 -1
- data/examples/data_formats/JSON/json_ast_builder.rb +12 -9
- data/examples/data_formats/JSON/json_ast_nodes.rb +12 -21
- data/examples/data_formats/JSON/json_grammar.rb +2 -2
- data/examples/data_formats/JSON/json_lexer.rb +8 -8
- data/examples/data_formats/JSON/json_minifier.rb +1 -1
- data/examples/general/calc_iter1/calc_ast_builder.rb +13 -10
- data/examples/general/calc_iter1/calc_ast_nodes.rb +23 -37
- data/examples/general/calc_iter1/calc_grammar.rb +2 -2
- data/examples/general/calc_iter1/calc_lexer.rb +6 -4
- data/examples/general/calc_iter1/spec/calculator_spec.rb +5 -5
- data/examples/general/calc_iter2/calc_ast_builder.rb +5 -3
- data/examples/general/calc_iter2/calc_ast_nodes.rb +27 -43
- data/examples/general/calc_iter2/calc_grammar.rb +3 -3
- data/examples/general/calc_iter2/calc_lexer.rb +11 -10
- data/examples/general/calc_iter2/spec/calculator_spec.rb +26 -26
- data/examples/general/left.rb +2 -2
- data/examples/general/right.rb +2 -2
- data/lib/rley/base/dotted_item.rb +23 -31
- data/lib/rley/constants.rb +2 -2
- data/lib/rley/engine.rb +20 -23
- data/lib/rley/formatter/asciitree.rb +3 -3
- data/lib/rley/formatter/bracket_notation.rb +1 -8
- data/lib/rley/formatter/debug.rb +6 -6
- data/lib/rley/formatter/json.rb +2 -2
- data/lib/rley/gfg/call_edge.rb +1 -1
- data/lib/rley/gfg/edge.rb +5 -5
- data/lib/rley/gfg/end_vertex.rb +2 -6
- data/lib/rley/gfg/epsilon_edge.rb +1 -5
- data/lib/rley/gfg/grm_flow_graph.rb +27 -23
- data/lib/rley/gfg/item_vertex.rb +10 -10
- data/lib/rley/gfg/non_terminal_vertex.rb +4 -4
- data/lib/rley/gfg/scan_edge.rb +1 -1
- data/lib/rley/gfg/shortcut_edge.rb +2 -2
- data/lib/rley/gfg/start_vertex.rb +4 -8
- data/lib/rley/gfg/vertex.rb +43 -39
- data/lib/rley/lexical/token_range.rb +6 -6
- data/lib/rley/parse_forest_visitor.rb +5 -5
- data/lib/rley/parse_rep/ast_base_builder.rb +9 -11
- data/lib/rley/parse_rep/cst_builder.rb +5 -6
- data/lib/rley/parse_rep/parse_forest_builder.rb +20 -18
- data/lib/rley/parse_rep/parse_forest_factory.rb +3 -3
- data/lib/rley/parse_rep/parse_rep_creator.rb +11 -13
- data/lib/rley/parse_rep/parse_tree_builder.rb +4 -4
- data/lib/rley/parse_rep/parse_tree_factory.rb +27 -27
- data/lib/rley/parse_tree_visitor.rb +1 -1
- data/lib/rley/parser/error_reason.rb +4 -5
- data/lib/rley/parser/gfg_chart.rb +20 -22
- data/lib/rley/parser/gfg_parsing.rb +16 -30
- data/lib/rley/parser/parse_entry.rb +25 -31
- data/lib/rley/parser/parse_entry_set.rb +18 -15
- data/lib/rley/parser/parse_entry_tracker.rb +4 -4
- data/lib/rley/parser/parse_state.rb +16 -21
- data/lib/rley/parser/parse_state_tracker.rb +4 -4
- data/lib/rley/parser/parse_tracer.rb +13 -13
- data/lib/rley/parser/parse_walker_factory.rb +23 -28
- data/lib/rley/parser/state_set.rb +9 -10
- data/lib/rley/ptree/non_terminal_node.rb +7 -5
- data/lib/rley/ptree/parse_tree.rb +3 -3
- data/lib/rley/ptree/parse_tree_node.rb +5 -5
- data/lib/rley/ptree/terminal_node.rb +7 -7
- data/lib/rley/rley_error.rb +12 -12
- data/lib/rley/sppf/alternative_node.rb +6 -6
- data/lib/rley/sppf/composite_node.rb +7 -7
- data/lib/rley/sppf/epsilon_node.rb +3 -3
- data/lib/rley/sppf/leaf_node.rb +3 -3
- data/lib/rley/sppf/parse_forest.rb +16 -16
- data/lib/rley/sppf/sppf_node.rb +7 -8
- data/lib/rley/sppf/token_node.rb +3 -3
- data/lib/rley/syntax/grammar.rb +5 -5
- data/lib/rley/syntax/grammar_builder.rb +9 -9
- data/lib/rley/syntax/grm_symbol.rb +6 -6
- data/lib/rley/syntax/non_terminal.rb +9 -15
- data/lib/rley/syntax/production.rb +10 -10
- data/lib/rley/syntax/symbol_seq.rb +7 -9
- data/lib/rley/syntax/terminal.rb +4 -5
- data/lib/rley/syntax/verbatim_symbol.rb +3 -3
- data/lib/support/base_tokenizer.rb +19 -18
- data/spec/rley/base/dotted_item_spec.rb +2 -2
- data/spec/rley/engine_spec.rb +17 -15
- data/spec/rley/formatter/asciitree_spec.rb +7 -7
- data/spec/rley/formatter/bracket_notation_spec.rb +13 -13
- data/spec/rley/formatter/json_spec.rb +1 -1
- data/spec/rley/gfg/end_vertex_spec.rb +5 -5
- data/spec/rley/gfg/item_vertex_spec.rb +10 -10
- data/spec/rley/gfg/non_terminal_vertex_spec.rb +3 -3
- data/spec/rley/gfg/shortcut_edge_spec.rb +1 -1
- data/spec/rley/gfg/start_vertex_spec.rb +5 -5
- data/spec/rley/gfg/vertex_spec.rb +3 -3
- data/spec/rley/lexical/token_range_spec.rb +16 -16
- data/spec/rley/lexical/token_spec.rb +2 -2
- data/spec/rley/parse_forest_visitor_spec.rb +165 -163
- data/spec/rley/parse_rep/ambiguous_parse_spec.rb +44 -44
- data/spec/rley/parse_rep/ast_builder_spec.rb +6 -6
- data/spec/rley/parse_rep/cst_builder_spec.rb +5 -5
- data/spec/rley/parse_rep/groucho_spec.rb +21 -21
- data/spec/rley/parse_rep/parse_forest_builder_spec.rb +26 -26
- data/spec/rley/parse_rep/parse_forest_factory_spec.rb +6 -6
- data/spec/rley/parse_rep/parse_tree_factory_spec.rb +2 -2
- data/spec/rley/parse_tree_visitor_spec.rb +10 -8
- data/spec/rley/parser/error_reason_spec.rb +6 -6
- data/spec/rley/parser/gfg_earley_parser_spec.rb +4 -2
- data/spec/rley/parser/gfg_parsing_spec.rb +4 -8
- data/spec/rley/parser/parse_entry_spec.rb +19 -19
- data/spec/rley/parser/parse_state_spec.rb +5 -5
- data/spec/rley/parser/parse_walker_factory_spec.rb +1 -1
- data/spec/rley/parser/state_set_spec.rb +22 -22
- data/spec/rley/ptree/non_terminal_node_spec.rb +5 -3
- data/spec/rley/ptree/parse_tree_node_spec.rb +4 -4
- data/spec/rley/ptree/terminal_node_spec.rb +6 -6
- data/spec/rley/sppf/alternative_node_spec.rb +6 -6
- data/spec/rley/sppf/non_terminal_node_spec.rb +3 -3
- data/spec/rley/sppf/token_node_spec.rb +4 -4
- data/spec/rley/support/ambiguous_grammar_helper.rb +3 -4
- data/spec/rley/support/grammar_abc_helper.rb +2 -4
- data/spec/rley/support/grammar_ambig01_helper.rb +4 -5
- data/spec/rley/support/grammar_arr_int_helper.rb +4 -5
- data/spec/rley/support/grammar_b_expr_helper.rb +4 -5
- data/spec/rley/support/grammar_l0_helper.rb +10 -11
- data/spec/rley/support/grammar_pb_helper.rb +6 -5
- data/spec/rley/support/grammar_sppf_helper.rb +1 -1
- data/spec/rley/syntax/grammar_builder_spec.rb +5 -5
- data/spec/rley/syntax/grammar_spec.rb +6 -6
- data/spec/rley/syntax/grm_symbol_spec.rb +1 -1
- data/spec/rley/syntax/non_terminal_spec.rb +8 -8
- data/spec/rley/syntax/production_spec.rb +13 -13
- data/spec/rley/syntax/symbol_seq_spec.rb +2 -2
- data/spec/rley/syntax/terminal_spec.rb +5 -5
- data/spec/rley/syntax/verbatim_symbol_spec.rb +1 -1
- data/spec/spec_helper.rb +0 -12
- data/spec/support/base_tokenizer_spec.rb +7 -2
- metadata +21 -62
- data/.simplecov +0 -8
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -90,7 +90,7 @@ directory
|
|
90
90
|
|
91
91
|
# Let's create a facade object called 'engine'
|
92
92
|
# It provides a unified, higher-level interface
|
93
|
-
engine = Rley
|
93
|
+
engine = Rley::Engine.new
|
94
94
|
```
|
95
95
|
|
96
96
|
|
@@ -446,6 +446,7 @@ actively curated by Andrei Beliankou (aka arbox).
|
|
446
446
|
## Thanks to:
|
447
447
|
* Professor Keshav Pingali, one of the creators of the Grammar Flow Graph parsing approach for his encouraging e-mail exchange.
|
448
448
|
* [Arjun Menon](https://github.com/arjunmenon) for his NLP example that uses `engtagger` gem.
|
449
|
+
* [Gui Heurich](https://github.com/GuiHeurich) for spotting a mistake in the code sample in `README` file.
|
449
450
|
|
450
451
|
## Grammar Flow Graph
|
451
452
|
Since the Grammar Flow Graph parsing approach is quite new, it has not yet taken a place in
|
@@ -458,5 +459,5 @@ standard parser textbooks. Here are a few references (and links) of papers on GF
|
|
458
459
|
|
459
460
|
Copyright
|
460
461
|
---------
|
461
|
-
Copyright (c) 2014-
|
462
|
+
Copyright (c) 2014-2020, Dimitri Geshef.
|
462
463
|
__Rley__ is released under the MIT License see [LICENSE.txt](https://github.com/famished-tiger/Rley/blob/master/LICENSE.txt) for details.
|
data/examples/NLP/engtagger.rb
CHANGED
@@ -1,190 +1,193 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'rley'
|
4
|
-
require 'engtagger' # Load POS (Part-Of-Speech) tagger EngTagger
|
5
|
-
|
6
|
-
# REGEX to remove XML tags from Engtagger output
|
7
|
-
GET_TAG = /<(.+?)>(.*?)<.+?>/.freeze
|
8
|
-
|
9
|
-
# Text tokenizer
|
10
|
-
# Taken directly from Engtagger, will ensure uniform indexing while parsing
|
11
|
-
def clean_text(text)
|
12
|
-
return false unless valid_text(text)
|
13
|
-
|
14
|
-
text = text.toutf8
|
15
|
-
cleaned_text = text
|
16
|
-
tokenized = []
|
17
|
-
# Tokenize the text (splitting on punctuation as you go)
|
18
|
-
cleaned_text.split(/\s+/).each do |line|
|
19
|
-
tokenized += split_punct(line)
|
20
|
-
end
|
21
|
-
words = split_sentences(tokenized)
|
22
|
-
return words
|
23
|
-
end
|
24
|
-
|
25
|
-
def valid_text(text)
|
26
|
-
if !text
|
27
|
-
# there's nothing to parse
|
28
|
-
puts 'method call on uninitialized variable'
|
29
|
-
return false
|
30
|
-
elsif /\A\s*\z/ =~ text
|
31
|
-
# text is an empty string, nothing to parse
|
32
|
-
return false
|
33
|
-
else
|
34
|
-
# $text is valid
|
35
|
-
return true
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
def split_sentences(array)
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
end
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
end
|
85
|
-
|
86
|
-
|
87
|
-
#
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
#
|
93
|
-
text
|
94
|
-
|
95
|
-
#
|
96
|
-
text = text.gsub(
|
97
|
-
|
98
|
-
|
99
|
-
#
|
100
|
-
text = text.gsub(/(
|
101
|
-
|
102
|
-
|
103
|
-
#
|
104
|
-
text = text.gsub(/
|
105
|
-
|
106
|
-
#
|
107
|
-
text = text.gsub(
|
108
|
-
|
109
|
-
|
110
|
-
text = text.gsub(
|
111
|
-
text = text.gsub(
|
112
|
-
|
113
|
-
|
114
|
-
text = text.gsub(/([
|
115
|
-
|
116
|
-
#
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
#
|
130
|
-
nlp_engine.
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
rule '
|
140
|
-
rule '
|
141
|
-
rule '
|
142
|
-
rule '
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
#
|
155
|
-
|
156
|
-
|
157
|
-
lexicon
|
158
|
-
|
159
|
-
#
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
#
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rley'
|
4
|
+
require 'engtagger' # Load POS (Part-Of-Speech) tagger EngTagger
|
5
|
+
|
6
|
+
# REGEX to remove XML tags from Engtagger output
|
7
|
+
GET_TAG = /<(.+?)>(.*?)<.+?>/.freeze
|
8
|
+
|
9
|
+
# Text tokenizer
|
10
|
+
# Taken directly from Engtagger, will ensure uniform indexing while parsing
|
11
|
+
def clean_text(text)
|
12
|
+
return false unless valid_text(text)
|
13
|
+
|
14
|
+
text = text.toutf8
|
15
|
+
cleaned_text = text
|
16
|
+
tokenized = []
|
17
|
+
# Tokenize the text (splitting on punctuation as you go)
|
18
|
+
cleaned_text.split(/\s+/).each do |line|
|
19
|
+
tokenized += split_punct(line)
|
20
|
+
end
|
21
|
+
words = split_sentences(tokenized)
|
22
|
+
return words
|
23
|
+
end
|
24
|
+
|
25
|
+
def valid_text(text)
|
26
|
+
if !text
|
27
|
+
# there's nothing to parse
|
28
|
+
puts 'method call on uninitialized variable'
|
29
|
+
return false
|
30
|
+
elsif /\A\s*\z/ =~ text
|
31
|
+
# text is an empty string, nothing to parse
|
32
|
+
return false
|
33
|
+
else
|
34
|
+
# $text is valid
|
35
|
+
return true
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def split_sentences(array)
|
40
|
+
# rubocop: disable Layout/ArrayAlignment
|
41
|
+
tokenized = array
|
42
|
+
people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
|
43
|
+
supt det mssrs rev]
|
44
|
+
army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
|
45
|
+
inst = %w[dept univ assn bros ph.d]
|
46
|
+
place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
|
47
|
+
hwy hway la pde pd plz pl rd st tce]
|
48
|
+
comp = %w[mfg inc ltd co corp]
|
49
|
+
state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
|
50
|
+
ind ia kans kan ken ky la me md is mass mich minn miss mo mont
|
51
|
+
neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
|
52
|
+
va wash wis wisc wy wyo usafa alta man ont que sask yuk]
|
53
|
+
month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
|
54
|
+
misc = %w[vs etc no esp]
|
55
|
+
abbr = {}
|
56
|
+
[people, army, inst, place, comp, state, month, misc].flatten.each do |i|
|
57
|
+
abbr[i] = true
|
58
|
+
end
|
59
|
+
words = []
|
60
|
+
tokenized.each_with_index do |_t, i|
|
61
|
+
if tokenized[i + 1] &&
|
62
|
+
tokenized[i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
|
63
|
+
w = $1
|
64
|
+
# Don't separate the period off words that
|
65
|
+
# meet any of the following conditions:
|
66
|
+
#
|
67
|
+
# 1. It is defined in one of the lists above
|
68
|
+
# 2. It is only one letter long: Alfred E. Sloan
|
69
|
+
# 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
|
70
|
+
unless abbr[w.downcase] ||
|
71
|
+
w =~ /\A[a-z]\z/i || w =~ /[a-z](?:\.[a-z])+\z/i
|
72
|
+
words << w
|
73
|
+
words << '.'
|
74
|
+
next
|
75
|
+
end
|
76
|
+
end
|
77
|
+
words << tokenized[i]
|
78
|
+
end
|
79
|
+
|
80
|
+
# If the final word ends in a period..
|
81
|
+
if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
|
82
|
+
words[-1] = $1
|
83
|
+
words.push '.'
|
84
|
+
end
|
85
|
+
words
|
86
|
+
end
|
87
|
+
# rubocop: enable Layout/ArrayAlignment
|
88
|
+
|
89
|
+
# Separate punctuation from words, where appropriate. This leaves trailing
|
90
|
+
# periods in place to be dealt with later. Called by the clean_text method.
|
91
|
+
def split_punct(text)
|
92
|
+
# If there's no punctuation, return immediately
|
93
|
+
return [text] if /\A\w+\z/ =~ text
|
94
|
+
|
95
|
+
# Sanity checks
|
96
|
+
text = text.gsub(/\W{10,}/o, ' ')
|
97
|
+
|
98
|
+
# Put quotes into a standard format
|
99
|
+
text = text.gsub(/`(?!`)(?=.*\w)/o, '` ') # Shift left quotes off text
|
100
|
+
text = text.gsub(/"(?=.*\w)/o, ' `` ') # Convert left quotes to ``
|
101
|
+
|
102
|
+
# Convert left quote to `
|
103
|
+
text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? "#{$1} ` " : ' ` ' }
|
104
|
+
text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
|
105
|
+
|
106
|
+
# Separate right single quotes
|
107
|
+
text = text.gsub(/(\w)'(?!')(?=\W|$)/o, "\\1 ' ")
|
108
|
+
|
109
|
+
# Handle all other punctuation
|
110
|
+
text = text.gsub(/--+/o, ' - ') # Convert and separate dashes
|
111
|
+
text = text.gsub(/,(?!\d)/o, ' , ') # Shift comma off unless it is followed by a digit
|
112
|
+
text = text.gsub(/:/o, ' :') # Shift colon off
|
113
|
+
text = text.gsub(/(\.\.\.+)/o, ' \1 ') # Shift ellipses off
|
114
|
+
text = text.gsub(/([(\[{}\])])/o, ' \1 ') # Shift off brackets
|
115
|
+
|
116
|
+
# Shift off other ``standard'' punctuation
|
117
|
+
text = text.gsub(/([!?#$%;~|])/o, ' \1 ')
|
118
|
+
|
119
|
+
# English-specific contractions
|
120
|
+
# Separate off 'd 'm 's
|
121
|
+
text = text.gsub(/([A-Za-z])'([dms])\b/o, "\\1 '\\2")
|
122
|
+
text = text.gsub(/n't\b/o, " n't") # Separate off n't
|
123
|
+
text = text.gsub(/'(ve|ll|re)\b/o, " '\\1") # Separate off 've, 'll, 're
|
124
|
+
result = text.split
|
125
|
+
return result
|
126
|
+
end
|
127
|
+
|
128
|
+
|
129
|
+
# Instantiate a facade object as our Rley interface
|
130
|
+
nlp_engine = Rley::Engine.new
|
131
|
+
|
132
|
+
# Now build a very simplified English grammar...
|
133
|
+
nlp_engine.build_grammar do
|
134
|
+
# Terminals have same names as POS tags returned by Engtagger
|
135
|
+
add_terminals('NN', 'NNP')
|
136
|
+
add_terminals('DET', 'IN', 'VBD')
|
137
|
+
|
138
|
+
# Here we define the productions (= grammar rules)
|
139
|
+
rule 'S' => %w[NP VP]
|
140
|
+
rule 'NP' => 'NNP'
|
141
|
+
rule 'NP' => %w[DET NN]
|
142
|
+
rule 'NP' => %w[DET NN PP]
|
143
|
+
rule 'VP' => %w[VBD NP]
|
144
|
+
rule 'VP' => %w[VBD NP PP]
|
145
|
+
rule 'PP' => %w[IN NP]
|
146
|
+
end
|
147
|
+
|
148
|
+
# text = "Yo I'm not done with you"
|
149
|
+
text = 'John saw Mary with a telescope'
|
150
|
+
puts "Input text --> #{text}"
|
151
|
+
|
152
|
+
tgr = EngTagger.new
|
153
|
+
|
154
|
+
# Generate raw POS output
|
155
|
+
tagged = tgr.add_tags(text)
|
156
|
+
|
157
|
+
# Generate tokenized lexicon of input text
|
158
|
+
# Instead of creating a lexicon dictionary,
|
159
|
+
# we would simply generate one each time on the fly for the current text only.
|
160
|
+
lexicon = clean_text(text)
|
161
|
+
|
162
|
+
# Convert EngTagger POS tokens in [[word, pos], ..] format
|
163
|
+
tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
|
164
|
+
|
165
|
+
def tokenizer(lexicon, tokens)
|
166
|
+
pos = -1
|
167
|
+
rley_tokens = []
|
168
|
+
lexicon.each_with_index do |word, i|
|
169
|
+
term_name = tokens[i].last
|
170
|
+
rank = Rley::Lexical::Position.new(1, pos + 1)
|
171
|
+
pos += word.length + 1 # Assuming one space between words.
|
172
|
+
rley_tokens << Rley::Lexical::Token.new(word, term_name, rank)
|
173
|
+
end
|
174
|
+
return rley_tokens
|
175
|
+
end
|
176
|
+
|
177
|
+
# Convert input text into a sequence of rley token objects...
|
178
|
+
rley_tokens = tokenizer(lexicon, tokens)
|
179
|
+
|
180
|
+
# Let Rley grok the tokens
|
181
|
+
result = nlp_engine.parse(rley_tokens)
|
182
|
+
|
183
|
+
puts "Parsing successful? #{result.success?}" # => Parsing successful? true
|
184
|
+
puts result.failure_reason.message unless result.success?
|
185
|
+
|
186
|
+
ptree = nlp_engine.convert(result)
|
187
|
+
|
188
|
+
visitor = nlp_engine.ptree_visitor(ptree)
|
189
|
+
|
190
|
+
renderer = Rley::Formatter::Asciitree.new($stdout)
|
191
|
+
|
192
|
+
# Let's visualize the parse tree (in text format...)
|
193
|
+
puts renderer.render(visitor)
|
@@ -11,7 +11,7 @@ require 'rley' # Load the gem
|
|
11
11
|
# based on chapter 12 from the Jurafsky & Martin book.
|
12
12
|
# Daniel Jurafsky, James H. Martin: "Speech and Language Processing";
|
13
13
|
# 2009, Pearson Education, Inc., ISBN 978-0135041963
|
14
|
-
# It defines the syntax of a sentence in a mini English-like language
|
14
|
+
# It defines the syntax of a sentence in a mini English-like language
|
15
15
|
builder = Rley::Syntax::GrammarBuilder.new do
|
16
16
|
add_terminals('Pronoun', 'Proper-Noun')
|
17
17
|
add_terminals('Determiner', 'Noun')
|
@@ -21,7 +21,7 @@ builder = Rley::Syntax::GrammarBuilder.new do
|
|
21
21
|
|
22
22
|
rule 'language' => 'sentence'
|
23
23
|
rule 'sentence' => 'declarative'
|
24
|
-
rule 'sentence' => 'imperative'
|
24
|
+
rule 'sentence' => 'imperative'
|
25
25
|
rule 'sentence' => 'yes_no_question'
|
26
26
|
rule 'sentence' => 'wh_subject_question'
|
27
27
|
rule 'sentence' => 'wh_non_subject_question'
|
@@ -33,7 +33,7 @@ builder = Rley::Syntax::GrammarBuilder.new do
|
|
33
33
|
rule 'NP' => %w[Predeterminer NP]
|
34
34
|
rule 'NP' => 'Pronoun'
|
35
35
|
rule 'NP' => 'Proper-Noun'
|
36
|
-
rule 'NP' => %w[Det Card Ord Quant Nominal]
|
36
|
+
rule 'NP' => %w[Det Card Ord Quant Nominal]
|
37
37
|
rule 'VP' => 'Verb'
|
38
38
|
rule 'VP' => %w[Verb NP]
|
39
39
|
rule 'VP' => %w[Verb NP PP]
|
@@ -43,12 +43,12 @@ builder = Rley::Syntax::GrammarBuilder.new do
|
|
43
43
|
rule 'Card' => 'Cardinal_number'
|
44
44
|
rule 'Card' => []
|
45
45
|
rule 'Ord' => 'Ordinal_number'
|
46
|
-
rule 'Ord' => []
|
46
|
+
rule 'Ord' => []
|
47
47
|
rule 'Nominal' => 'Noun'
|
48
48
|
rule 'Nominal' => %w[Nominal Noun]
|
49
49
|
rule 'Nominal' => %w[Nominal GerundVP]
|
50
50
|
rule 'Nominal' => %w[Nominal RelClause]
|
51
|
-
rule 'PP' => %w[Preposition NP]
|
51
|
+
rule 'PP' => %w[Preposition NP]
|
52
52
|
rule 'GerundVP' => 'GerundV'
|
53
53
|
rule 'GerundVP' => %w[GerundV NP]
|
54
54
|
rule 'GerundVP' => %w[GerundV NP PP]
|