rley 0.7.06 → 0.8.01
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +362 -62
- data/.travis.yml +6 -6
- data/CHANGELOG.md +20 -4
- data/LICENSE.txt +1 -1
- data/README.md +7 -7
- data/examples/NLP/engtagger.rb +193 -190
- data/examples/NLP/nano_eng/nano_en_demo.rb +7 -11
- data/examples/NLP/nano_eng/nano_grammar.rb +21 -21
- data/examples/NLP/pico_en_demo.rb +2 -2
- data/examples/data_formats/JSON/cli_options.rb +1 -1
- data/examples/data_formats/JSON/json_ast_builder.rb +21 -27
- data/examples/data_formats/JSON/json_ast_nodes.rb +12 -21
- data/examples/data_formats/JSON/json_demo.rb +1 -2
- data/examples/data_formats/JSON/json_grammar.rb +13 -13
- data/examples/data_formats/JSON/json_lexer.rb +8 -8
- data/examples/data_formats/JSON/json_minifier.rb +1 -1
- data/examples/general/calc_iter1/calc_ast_builder.rb +13 -10
- data/examples/general/calc_iter1/calc_ast_nodes.rb +23 -37
- data/examples/general/calc_iter1/calc_grammar.rb +7 -6
- data/examples/general/calc_iter1/calc_lexer.rb +6 -4
- data/examples/general/calc_iter1/spec/calculator_spec.rb +5 -5
- data/examples/general/calc_iter2/calc_ast_builder.rb +5 -3
- data/examples/general/calc_iter2/calc_ast_nodes.rb +27 -43
- data/examples/general/calc_iter2/calc_grammar.rb +12 -12
- data/examples/general/calc_iter2/calc_lexer.rb +11 -10
- data/examples/general/calc_iter2/spec/calculator_spec.rb +26 -26
- data/examples/general/left.rb +2 -2
- data/examples/general/right.rb +2 -2
- data/lib/rley.rb +1 -1
- data/lib/rley/base/dotted_item.rb +28 -31
- data/lib/rley/base/grm_items_builder.rb +6 -0
- data/lib/rley/constants.rb +2 -2
- data/lib/rley/engine.rb +22 -25
- data/lib/rley/formatter/asciitree.rb +3 -3
- data/lib/rley/formatter/bracket_notation.rb +1 -8
- data/lib/rley/formatter/debug.rb +6 -6
- data/lib/rley/formatter/json.rb +2 -2
- data/lib/rley/gfg/call_edge.rb +1 -1
- data/lib/rley/gfg/edge.rb +5 -5
- data/lib/rley/gfg/end_vertex.rb +2 -6
- data/lib/rley/gfg/epsilon_edge.rb +1 -5
- data/lib/rley/gfg/grm_flow_graph.rb +27 -23
- data/lib/rley/gfg/item_vertex.rb +10 -10
- data/lib/rley/gfg/non_terminal_vertex.rb +4 -4
- data/lib/rley/gfg/scan_edge.rb +1 -1
- data/lib/rley/gfg/shortcut_edge.rb +2 -2
- data/lib/rley/gfg/start_vertex.rb +4 -8
- data/lib/rley/gfg/vertex.rb +43 -39
- data/lib/rley/interface.rb +16 -0
- data/lib/rley/lexical/token_range.rb +6 -6
- data/lib/rley/notation/all_notation_nodes.rb +2 -0
- data/lib/rley/notation/ast_builder.rb +191 -0
- data/lib/rley/notation/ast_node.rb +44 -0
- data/lib/rley/notation/ast_visitor.rb +113 -0
- data/lib/rley/notation/grammar.rb +49 -0
- data/lib/rley/notation/grammar_builder.rb +504 -0
- data/lib/rley/notation/grouping_node.rb +23 -0
- data/lib/rley/notation/parser.rb +56 -0
- data/lib/rley/notation/sequence_node.rb +35 -0
- data/lib/rley/notation/symbol_node.rb +29 -0
- data/lib/rley/notation/tokenizer.rb +192 -0
- data/lib/rley/parse_forest_visitor.rb +5 -5
- data/lib/rley/parse_rep/ast_base_builder.rb +48 -11
- data/lib/rley/parse_rep/cst_builder.rb +5 -6
- data/lib/rley/parse_rep/parse_forest_builder.rb +22 -18
- data/lib/rley/parse_rep/parse_forest_factory.rb +3 -3
- data/lib/rley/parse_rep/parse_rep_creator.rb +14 -16
- data/lib/rley/parse_rep/parse_tree_builder.rb +4 -4
- data/lib/rley/parse_rep/parse_tree_factory.rb +27 -27
- data/lib/rley/parse_tree_visitor.rb +1 -1
- data/lib/rley/parser/error_reason.rb +4 -5
- data/lib/rley/parser/gfg_chart.rb +118 -26
- data/lib/rley/parser/gfg_parsing.rb +22 -33
- data/lib/rley/parser/parse_entry.rb +25 -31
- data/lib/rley/parser/parse_entry_set.rb +19 -16
- data/lib/rley/parser/parse_entry_tracker.rb +4 -4
- data/lib/rley/parser/parse_tracer.rb +13 -13
- data/lib/rley/parser/parse_walker_factory.rb +23 -28
- data/lib/rley/ptree/non_terminal_node.rb +7 -5
- data/lib/rley/ptree/parse_tree.rb +3 -3
- data/lib/rley/ptree/parse_tree_node.rb +5 -5
- data/lib/rley/ptree/terminal_node.rb +7 -7
- data/lib/rley/rley_error.rb +12 -12
- data/lib/rley/sppf/alternative_node.rb +6 -6
- data/lib/rley/sppf/composite_node.rb +7 -7
- data/lib/rley/sppf/epsilon_node.rb +3 -3
- data/lib/rley/sppf/leaf_node.rb +3 -3
- data/lib/rley/sppf/parse_forest.rb +16 -16
- data/lib/rley/sppf/sppf_node.rb +7 -8
- data/lib/rley/sppf/token_node.rb +3 -3
- data/lib/rley/syntax/{grammar_builder.rb → base_grammar_builder.rb} +61 -23
- data/lib/rley/syntax/grammar.rb +5 -5
- data/lib/rley/syntax/grm_symbol.rb +7 -7
- data/lib/rley/syntax/match_closest.rb +43 -0
- data/lib/rley/syntax/non_terminal.rb +9 -15
- data/lib/rley/syntax/production.rb +16 -10
- data/lib/rley/syntax/symbol_seq.rb +7 -9
- data/lib/rley/syntax/terminal.rb +4 -5
- data/lib/rley/syntax/verbatim_symbol.rb +3 -3
- data/lib/support/base_tokenizer.rb +19 -18
- data/spec/rley/base/dotted_item_spec.rb +2 -2
- data/spec/rley/engine_spec.rb +23 -21
- data/spec/rley/formatter/asciitree_spec.rb +7 -7
- data/spec/rley/formatter/bracket_notation_spec.rb +13 -13
- data/spec/rley/formatter/json_spec.rb +1 -1
- data/spec/rley/gfg/end_vertex_spec.rb +5 -5
- data/spec/rley/gfg/grm_flow_graph_spec.rb +2 -2
- data/spec/rley/gfg/item_vertex_spec.rb +10 -10
- data/spec/rley/gfg/non_terminal_vertex_spec.rb +3 -3
- data/spec/rley/gfg/shortcut_edge_spec.rb +1 -1
- data/spec/rley/gfg/start_vertex_spec.rb +5 -5
- data/spec/rley/gfg/vertex_spec.rb +3 -3
- data/spec/rley/lexical/token_range_spec.rb +16 -16
- data/spec/rley/lexical/token_spec.rb +2 -2
- data/spec/rley/notation/grammar_builder_spec.rb +302 -0
- data/spec/rley/notation/parser_spec.rb +184 -0
- data/spec/rley/notation/tokenizer_spec.rb +370 -0
- data/spec/rley/parse_forest_visitor_spec.rb +165 -163
- data/spec/rley/parse_rep/ambiguous_parse_spec.rb +44 -44
- data/spec/rley/parse_rep/ast_builder_spec.rb +6 -7
- data/spec/rley/parse_rep/cst_builder_spec.rb +5 -5
- data/spec/rley/parse_rep/groucho_spec.rb +24 -26
- data/spec/rley/parse_rep/parse_forest_builder_spec.rb +27 -27
- data/spec/rley/parse_rep/parse_forest_factory_spec.rb +8 -8
- data/spec/rley/parse_rep/parse_tree_factory_spec.rb +3 -3
- data/spec/rley/parse_tree_visitor_spec.rb +10 -8
- data/spec/rley/parser/dangling_else_spec.rb +445 -0
- data/spec/rley/parser/error_reason_spec.rb +6 -6
- data/spec/rley/parser/gfg_earley_parser_spec.rb +120 -12
- data/spec/rley/parser/gfg_parsing_spec.rb +6 -13
- data/spec/rley/parser/parse_entry_spec.rb +19 -19
- data/spec/rley/parser/parse_walker_factory_spec.rb +10 -10
- data/spec/rley/ptree/non_terminal_node_spec.rb +5 -3
- data/spec/rley/ptree/parse_tree_node_spec.rb +4 -4
- data/spec/rley/ptree/terminal_node_spec.rb +6 -6
- data/spec/rley/sppf/alternative_node_spec.rb +6 -6
- data/spec/rley/sppf/non_terminal_node_spec.rb +3 -3
- data/spec/rley/sppf/token_node_spec.rb +4 -4
- data/spec/rley/support/ambiguous_grammar_helper.rb +4 -5
- data/spec/rley/support/grammar_abc_helper.rb +3 -5
- data/spec/rley/support/grammar_ambig01_helper.rb +5 -6
- data/spec/rley/support/grammar_arr_int_helper.rb +5 -6
- data/spec/rley/support/grammar_b_expr_helper.rb +5 -6
- data/spec/rley/support/grammar_int_seq_helper.rb +51 -0
- data/spec/rley/support/grammar_l0_helper.rb +14 -17
- data/spec/rley/support/grammar_pb_helper.rb +8 -7
- data/spec/rley/support/grammar_sppf_helper.rb +3 -3
- data/spec/rley/syntax/{grammar_builder_spec.rb → base_grammar_builder_spec.rb} +35 -16
- data/spec/rley/syntax/grammar_spec.rb +6 -6
- data/spec/rley/syntax/grm_symbol_spec.rb +1 -1
- data/spec/rley/syntax/match_closest_spec.rb +46 -0
- data/spec/rley/syntax/non_terminal_spec.rb +8 -8
- data/spec/rley/syntax/production_spec.rb +17 -13
- data/spec/rley/syntax/symbol_seq_spec.rb +2 -2
- data/spec/rley/syntax/terminal_spec.rb +5 -5
- data/spec/rley/syntax/verbatim_symbol_spec.rb +1 -1
- data/spec/spec_helper.rb +0 -12
- data/spec/support/base_tokenizer_spec.rb +7 -2
- metadata +48 -74
- data/.simplecov +0 -7
- data/lib/rley/parser/parse_state.rb +0 -83
- data/lib/rley/parser/parse_state_tracker.rb +0 -59
- data/lib/rley/parser/state_set.rb +0 -101
- data/spec/rley/parser/parse_state_spec.rb +0 -125
- data/spec/rley/parser/parse_tracer_spec.rb +0 -200
- data/spec/rley/parser/state_set_spec.rb +0 -130
data/.travis.yml
CHANGED
@@ -9,13 +9,13 @@ script:
   - bundle exec rake
 
 rvm:
-  - 2.
-  - 2.
-  - 2.
-  - 2.
+  - 2.7.1
+  - 2.6.6
+  - 2.5.8
+  - 2.4.10
   - ruby-head
-  - jruby-
-
+  - jruby-head
+before_install: gem install bundler -v 2.0.2
 
 matrix:
   allow_failures:
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,19 @@
+### 0.8.01 / 2021-08-22
+- Unused/redundant file removal. Fix in rule generation
+
+* [CHANGE] Removal of files in repository that were redundant/useless.
+* [FIX] The rule ordering was broken by the rules implicitly generated by Rlry
+
+### 0.8.00 / 2021-08-15
+- New grammar builder that accepts ? * + modifiers
+
+### 0.7.08 / 2021-05-30
+- Code restyling to please rubocop 1.15.0
+
+### 0.7.07 / 2020-11-16
+- Code restyling to please rubocop 0.93.1
+* [CHANGE] File `.travis.yml`: updated Ruby versions, drop support for Ruby 2.3.x
+
 ### 0.7.06 / 2019-11-22
 - [FIX] Method `ParseForestBuilder#process_end_entry`: Added a guard expression to prevent nil error.
 
@@ -5,10 +21,10 @@
 - [FIX] Method `GFGParsing#nullable_rule`: issue with nullable productions having at least one member in their rhs.
 
 ### 0.7.04 / 2019-08-17
-- Rley recognizer is about 25% faster than previous version. Kudos to the people
-behind the *magic_frozen_string_literal* gem.
+- Rley recognizer is about 25% faster than previous version. Kudos to the people
+behind the *magic_frozen_string_literal* gem.
 - Code refactoring to use string frozen magic comments (as a consequence, Rley runs only on Rubies 2.3 or newer).
-- Code restyling to please rubocop 0.7.40.
+- Code restyling to please rubocop 0.7.40.
 - [CHANGE] Class `ParseEntrySet`: minor code optimization
 - [CHANGE] File `README.md` removed allusion to Ruby 2.0.x up to 2.2.x.
 - [CHANGE] File `README.md` added Ruby 2.6.x up as supported version.
@@ -93,7 +109,7 @@ behind the *magic_frozen_string_literal* gem.
 * [FIX] Code re-styling to remove most style offenses found by Rubocop 0.52.1
 
 ### 0.6.00 / 2018-02-25
-Version bump. Highlights: new programming interface through facade object, improved AST generation.
+Version bump. Highlights: new programming interface through facade object, improved AST generation.
 * [NEW] Class `Rley::Engine`: Implementation of Facade design pattern to reach more convenient interface.
 * [NEW] Class `Rley::ParseRep::ASTBaseBuilder` Abstract class that simplifies the creation of custom AST (Abstract Syntax Tree)
 * [NEW] Module `Rley::ParseRep` hosts the classes for building parse representations (parse trees and forests)
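The 0.8.00 entry above introduces the new grammar builder (the `Rley::Notation` files listed at the top of this diff) that accepts the `?`, `*` and `+` occurrence modifiers. As a rough, hypothetical sketch only — the exact syntax should be checked against the 0.8.x documentation — the modifiers are appended to the symbol they qualify inside an ordinary rule:

require 'rley'

# Hypothetical grammar: a possibly empty, comma-separated list of integers.
engine = Rley::Engine.new
engine.build_grammar do
  add_terminals('INTEGER', 'COMMA')

  rule 'int_seq' => 'entry?'          # '?' : zero or one occurrence
  rule 'entry'   => 'INTEGER extra*'  # '*' : zero or more occurrences
  rule 'extra'   => 'COMMA INTEGER'   # '+' (one or more) works the same way
end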
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -50,11 +50,10 @@ application range such as:
 
 
 ### Compatibility
-Rley supports the following Ruby implementations:
-- MRI 2.3
-- MRI 2.4
+Rley supports the following Ruby implementations:
 - MRI 2.5
 - MRI 2.6
+- MRI 2.7
 - JRuby 9.1+
 
 ---
@@ -90,7 +89,7 @@ directory
 
 # Let's create a facade object called 'engine'
 # It provides a unified, higher-level interface
-engine = Rley
+engine = Rley::Engine.new
 ```
 
 
@@ -106,9 +105,9 @@ The subset of English grammar is based on an example from the NLTK book.
 # Here we define the productions (= grammar rules)
 rule 'S' => 'NP VP'
 rule 'NP' => 'Proper-Noun'
-rule 'NP' => 'Determiner Noun'
+rule 'NP' => 'Determiner Noun'
 rule 'NP' => 'Determiner Noun PP'
-rule 'VP' => 'Verb NP'
+rule 'VP' => 'Verb NP'
 rule 'VP' => 'Verb NP PP'
 rule 'PP' => 'Preposition NP'
 end
@@ -446,6 +445,7 @@ actively curated by Andrei Beliankou (aka arbox).
 ## Thanks to:
 * Professor Keshav Pingali, one of the creators of the Grammar Flow Graph parsing approach for his encouraging e-mail exchange.
 * [Arjun Menon](https://github.com/arjunmenon) for his NLP example that uses `engtagger` gem.
+* [Gui Heurich](https://github.com/GuiHeurich) for spotting a mistake in the code sample in `README` file.
 
 ## Grammar Flow Graph
 Since the Grammar Flow Graph parsing approach is quite new, it has not yet taken a place in
@@ -458,5 +458,5 @@ standard parser textbooks. Here are a few references (and links) of papers on GF
 
 Copyright
 ---------
-Copyright (c) 2014-
+Copyright (c) 2014-2020, Dimitri Geshef.
 __Rley__ is released under the MIT License see [LICENSE.txt](https://github.com/famished-tiger/Rley/blob/master/LICENSE.txt) for details.
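For context on the corrected sample line (`engine = Rley::Engine.new`): the README grammar above is enough for a complete parse once the words are wrapped in Rley tokens. The sketch below is a condensed, hypothetical variant of the engtagger example that follows, using a tiny hand-written lexicon instead of a POS tagger; only the Rley calls mirror those of that example:

require 'rley'

engine = Rley::Engine.new
engine.build_grammar do
  add_terminals('Determiner', 'Noun', 'Proper-Noun', 'Verb')
  rule 'S' => 'NP VP'
  rule 'NP' => 'Proper-Noun'
  rule 'NP' => 'Determiner Noun'
  rule 'VP' => 'Verb NP'
end

# Illustrative lexicon mapping each word to its terminal name
LEXICON = { 'John' => 'Proper-Noun', 'saw' => 'Verb',
            'the' => 'Determiner', 'dog' => 'Noun' }.freeze

# Build Rley tokens the same way the engtagger example below does
def tokenize(text)
  pos = -1
  text.split.map do |word|
    rank = Rley::Lexical::Position.new(1, pos + 1)
    pos += word.length + 1
    Rley::Lexical::Token.new(word, LEXICON.fetch(word), rank)
  end
end

result = engine.parse(tokenize('John saw the dog'))
puts "Parsing successful? #{result.success?}"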
data/examples/NLP/engtagger.rb
CHANGED
@@ -1,190 +1,193 @@
(The 190 removed lines of the previous version appear only as truncated fragments in the source diff; the 193 lines of the new version follow.)
+# frozen_string_literal: true
+
+require 'rley'
+require 'engtagger' # Load POS (Part-Of-Speech) tagger EngTagger
+
+# REGEX to remove XML tags from Engtagger output
+GET_TAG = /<(.+?)>(.*?)<.+?>/.freeze
+
+# Text tokenizer
+# Taken directly from Engtagger, will ensure uniform indexing while parsing
+def clean_text(text)
+  return false unless valid_text(text)
+
+  text = text.toutf8
+  cleaned_text = text
+  tokenized = []
+  # Tokenize the text (splitting on punctuation as you go)
+  cleaned_text.split(/\s+/).each do |line|
+    tokenized += split_punct(line)
+  end
+  words = split_sentences(tokenized)
+  return words
+end
+
+def valid_text(text)
+  if !text
+    # there's nothing to parse
+    puts 'method call on uninitialized variable'
+    return false
+  elsif /\A\s*\z/ =~ text
+    # text is an empty string, nothing to parse
+    return false
+  else
+    # $text is valid
+    return true
+  end
+end
+
+def split_sentences(array)
+  # rubocop: disable Layout/ArrayAlignment
+  tokenized = array
+  people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+              supt det mssrs rev]
+  army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
+  inst = %w[dept univ assn bros ph.d]
+  place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+             hwy hway la pde pd plz pl rd st tce]
+  comp = %w[mfg inc ltd co corp]
+  state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+             ind ia kans kan ken ky la me md is mass mich minn miss mo mont
+             neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
+             va wash wis wisc wy wyo usafa alta man ont que sask yuk]
+  month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
+  misc = %w[vs etc no esp]
+  abbr = {}
+  [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
+    abbr[i] = true
+  end
+  words = []
+  tokenized.each_with_index do |_t, i|
+    if tokenized[i + 1] &&
+       tokenized[i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
+      w = $1
+      # Don't separate the period off words that
+      # meet any of the following conditions:
+      #
+      # 1. It is defined in one of the lists above
+      # 2. It is only one letter long: Alfred E. Sloan
+      # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
+      unless abbr[w.downcase] ||
+             w =~ /\A[a-z]\z/i || w =~ /[a-z](?:\.[a-z])+\z/i
+        words << w
+        words << '.'
+        next
+      end
+    end
+    words << tokenized[i]
+  end
+
+  # If the final word ends in a period..
+  if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
+    words[-1] = $1
+    words.push '.'
+  end
+  words
+end
+# rubocop: enable Layout/ArrayAlignment
+
+# Separate punctuation from words, where appropriate. This leaves trailing
+# periods in place to be dealt with later. Called by the clean_text method.
+def split_punct(text)
+  # If there's no punctuation, return immediately
+  return [text] if /\A\w+\z/ =~ text
+
+  # Sanity checks
+  text = text.gsub(/\W{10,}/o, ' ')
+
+  # Put quotes into a standard format
+  text = text.gsub(/`(?!`)(?=.*\w)/o, '` ') # Shift left quotes off text
+  text = text.gsub(/"(?=.*\w)/o, ' `` ') # Convert left quotes to ``
+
+  # Convert left quote to `
+  text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? "#{$1} ` " : ' ` ' }
+  text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
+
+  # Separate right single quotes
+  text = text.gsub(/(\w)'(?!')(?=\W|$)/o, "\\1 ' ")
+
+  # Handle all other punctuation
+  text = text.gsub(/--+/o, ' - ') # Convert and separate dashes
+  text = text.gsub(/,(?!\d)/o, ' , ') # Shift comma if not following by digit
+  text = text.gsub(/:/o, ' :') # Shift semicolon off
+  text = text.gsub(/(\.\.\.+)/o, ' \1 ') # Shift ellipses off
+  text = text.gsub(/([(\[{}\])])/o, ' \1 ') # Shift off brackets
+
+  # Shift off other ``standard'' punctuation
+  text = text.gsub(/([!?#$%;~|])/o, ' \1 ')
+
+  # English-specific contractions
+  # Separate off 'd 'm 's
+  text = text.gsub(/([A-Za-z])'([dms])\b/o, "\\1 '\\2")
+  text = text.gsub(/n't\b/o, " n't") # Separate off n't
+  text = text.gsub(/'(ve|ll|re)\b/o, " '\\1") # Separate off 've, 'll, 're
+  result = text.split
+  return result
+end
+
+
+# Instantiate a facade object as our Rley interface
+nlp_engine = Rley::Engine.new
+
+# Now build a very simplified English grammar...
+nlp_engine.build_grammar do
+  # Terminals have same names as POS tags returned by Engtagger
+  add_terminals('NN', 'NNP')
+  add_terminals('DET', 'IN', 'VBD')
+
+  # Here we define the productions (= grammar rules)
+  rule 'S' => %w[NP VP]
+  rule 'NP' => 'NNP'
+  rule 'NP' => %w[DET NN]
+  rule 'NP' => %w[DET NN PP]
+  rule 'VP' => %w[VBD NP]
+  rule 'VP' => %w[VBD NP PP]
+  rule 'PP' => %w[IN NP]
+end
+
+# text = "Yo I'm not done with you"
+text = 'John saw Mary with a telescope'
+puts "Input text --> #{text}"
+
+tgr = EngTagger.new
+
+# Generate raw POS output
+tagged = tgr.add_tags(text)
+
+# Generte tokenied lexicon of input text
+# Instead of creating a lexicon dictionary,
+# we would simply generate one each time on the fly for the current text only.
+lexicon = clean_text(text)
+
+# Convert EngTagger POS tokens in [[word, pos], ..] format
+tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
+
+def tokenizer(lexicon, tokens)
+  pos = -1
+  rley_tokens = []
+  lexicon.each_with_index do |word, i|
+    term_name = tokens[i].last
+    rank = Rley::Lexical::Position.new(1, pos + 1)
+    pos += word.length + 1 # Assuming one space between words.
+    rley_tokens << Rley::Lexical::Token.new(word, term_name, rank)
+  end
+  return rley_tokens
+end
+
+# Convert input text into a sequence of rley token objects...
+rley_tokens = tokenizer(lexicon, tokens)
+
+# Let Rley grok the tokens
+result = nlp_engine.parse(rley_tokens)
+
+puts "Parsing successful? #{result.success?}" # => Parsing successful? true
+puts result.failure_reason.message unless result.success?
+
+ptree = nlp_engine.convert(result)
+
+visitor = nlp_engine.ptree_visitor(ptree)
+
+renderer = Rley::Formatter::Asciitree.new($stdout)
+
+# Let's visualize the parse tree (in text format...)
+puts renderer.render(visitor)