rley 0.7.07 → 0.7.08

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +348 -54
  3. data/LICENSE.txt +1 -1
  4. data/README.md +3 -2
  5. data/examples/NLP/engtagger.rb +193 -190
  6. data/examples/NLP/nano_eng/nano_grammar.rb +5 -5
  7. data/examples/data_formats/JSON/cli_options.rb +1 -1
  8. data/examples/data_formats/JSON/json_ast_builder.rb +12 -9
  9. data/examples/data_formats/JSON/json_ast_nodes.rb +12 -21
  10. data/examples/data_formats/JSON/json_grammar.rb +2 -2
  11. data/examples/data_formats/JSON/json_lexer.rb +8 -8
  12. data/examples/data_formats/JSON/json_minifier.rb +1 -1
  13. data/examples/general/calc_iter1/calc_ast_builder.rb +13 -10
  14. data/examples/general/calc_iter1/calc_ast_nodes.rb +23 -37
  15. data/examples/general/calc_iter1/calc_grammar.rb +2 -2
  16. data/examples/general/calc_iter1/calc_lexer.rb +6 -4
  17. data/examples/general/calc_iter1/spec/calculator_spec.rb +5 -5
  18. data/examples/general/calc_iter2/calc_ast_builder.rb +5 -3
  19. data/examples/general/calc_iter2/calc_ast_nodes.rb +27 -43
  20. data/examples/general/calc_iter2/calc_grammar.rb +3 -3
  21. data/examples/general/calc_iter2/calc_lexer.rb +11 -10
  22. data/examples/general/calc_iter2/spec/calculator_spec.rb +26 -26
  23. data/examples/general/left.rb +2 -2
  24. data/examples/general/right.rb +2 -2
  25. data/lib/rley/base/dotted_item.rb +23 -31
  26. data/lib/rley/constants.rb +2 -2
  27. data/lib/rley/engine.rb +20 -23
  28. data/lib/rley/formatter/asciitree.rb +3 -3
  29. data/lib/rley/formatter/bracket_notation.rb +1 -8
  30. data/lib/rley/formatter/debug.rb +6 -6
  31. data/lib/rley/formatter/json.rb +2 -2
  32. data/lib/rley/gfg/call_edge.rb +1 -1
  33. data/lib/rley/gfg/edge.rb +5 -5
  34. data/lib/rley/gfg/end_vertex.rb +2 -6
  35. data/lib/rley/gfg/epsilon_edge.rb +1 -5
  36. data/lib/rley/gfg/grm_flow_graph.rb +27 -23
  37. data/lib/rley/gfg/item_vertex.rb +10 -10
  38. data/lib/rley/gfg/non_terminal_vertex.rb +4 -4
  39. data/lib/rley/gfg/scan_edge.rb +1 -1
  40. data/lib/rley/gfg/shortcut_edge.rb +2 -2
  41. data/lib/rley/gfg/start_vertex.rb +4 -8
  42. data/lib/rley/gfg/vertex.rb +43 -39
  43. data/lib/rley/lexical/token_range.rb +6 -6
  44. data/lib/rley/parse_forest_visitor.rb +5 -5
  45. data/lib/rley/parse_rep/ast_base_builder.rb +9 -11
  46. data/lib/rley/parse_rep/cst_builder.rb +5 -6
  47. data/lib/rley/parse_rep/parse_forest_builder.rb +20 -18
  48. data/lib/rley/parse_rep/parse_forest_factory.rb +3 -3
  49. data/lib/rley/parse_rep/parse_rep_creator.rb +11 -13
  50. data/lib/rley/parse_rep/parse_tree_builder.rb +4 -4
  51. data/lib/rley/parse_rep/parse_tree_factory.rb +27 -27
  52. data/lib/rley/parse_tree_visitor.rb +1 -1
  53. data/lib/rley/parser/error_reason.rb +4 -5
  54. data/lib/rley/parser/gfg_chart.rb +20 -22
  55. data/lib/rley/parser/gfg_parsing.rb +16 -30
  56. data/lib/rley/parser/parse_entry.rb +25 -31
  57. data/lib/rley/parser/parse_entry_set.rb +18 -15
  58. data/lib/rley/parser/parse_entry_tracker.rb +4 -4
  59. data/lib/rley/parser/parse_state.rb +16 -21
  60. data/lib/rley/parser/parse_state_tracker.rb +4 -4
  61. data/lib/rley/parser/parse_tracer.rb +13 -13
  62. data/lib/rley/parser/parse_walker_factory.rb +23 -28
  63. data/lib/rley/parser/state_set.rb +9 -10
  64. data/lib/rley/ptree/non_terminal_node.rb +7 -5
  65. data/lib/rley/ptree/parse_tree.rb +3 -3
  66. data/lib/rley/ptree/parse_tree_node.rb +5 -5
  67. data/lib/rley/ptree/terminal_node.rb +7 -7
  68. data/lib/rley/rley_error.rb +12 -12
  69. data/lib/rley/sppf/alternative_node.rb +6 -6
  70. data/lib/rley/sppf/composite_node.rb +7 -7
  71. data/lib/rley/sppf/epsilon_node.rb +3 -3
  72. data/lib/rley/sppf/leaf_node.rb +3 -3
  73. data/lib/rley/sppf/parse_forest.rb +16 -16
  74. data/lib/rley/sppf/sppf_node.rb +7 -8
  75. data/lib/rley/sppf/token_node.rb +3 -3
  76. data/lib/rley/syntax/grammar.rb +5 -5
  77. data/lib/rley/syntax/grammar_builder.rb +9 -9
  78. data/lib/rley/syntax/grm_symbol.rb +6 -6
  79. data/lib/rley/syntax/non_terminal.rb +9 -15
  80. data/lib/rley/syntax/production.rb +10 -10
  81. data/lib/rley/syntax/symbol_seq.rb +7 -9
  82. data/lib/rley/syntax/terminal.rb +4 -5
  83. data/lib/rley/syntax/verbatim_symbol.rb +3 -3
  84. data/lib/support/base_tokenizer.rb +19 -18
  85. data/spec/rley/base/dotted_item_spec.rb +2 -2
  86. data/spec/rley/engine_spec.rb +17 -15
  87. data/spec/rley/formatter/asciitree_spec.rb +7 -7
  88. data/spec/rley/formatter/bracket_notation_spec.rb +13 -13
  89. data/spec/rley/formatter/json_spec.rb +1 -1
  90. data/spec/rley/gfg/end_vertex_spec.rb +5 -5
  91. data/spec/rley/gfg/item_vertex_spec.rb +10 -10
  92. data/spec/rley/gfg/non_terminal_vertex_spec.rb +3 -3
  93. data/spec/rley/gfg/shortcut_edge_spec.rb +1 -1
  94. data/spec/rley/gfg/start_vertex_spec.rb +5 -5
  95. data/spec/rley/gfg/vertex_spec.rb +3 -3
  96. data/spec/rley/lexical/token_range_spec.rb +16 -16
  97. data/spec/rley/lexical/token_spec.rb +2 -2
  98. data/spec/rley/parse_forest_visitor_spec.rb +165 -163
  99. data/spec/rley/parse_rep/ambiguous_parse_spec.rb +44 -44
  100. data/spec/rley/parse_rep/ast_builder_spec.rb +6 -6
  101. data/spec/rley/parse_rep/cst_builder_spec.rb +5 -5
  102. data/spec/rley/parse_rep/groucho_spec.rb +21 -21
  103. data/spec/rley/parse_rep/parse_forest_builder_spec.rb +26 -26
  104. data/spec/rley/parse_rep/parse_forest_factory_spec.rb +6 -6
  105. data/spec/rley/parse_rep/parse_tree_factory_spec.rb +2 -2
  106. data/spec/rley/parse_tree_visitor_spec.rb +10 -8
  107. data/spec/rley/parser/error_reason_spec.rb +6 -6
  108. data/spec/rley/parser/gfg_earley_parser_spec.rb +4 -2
  109. data/spec/rley/parser/gfg_parsing_spec.rb +4 -8
  110. data/spec/rley/parser/parse_entry_spec.rb +19 -19
  111. data/spec/rley/parser/parse_state_spec.rb +5 -5
  112. data/spec/rley/parser/parse_walker_factory_spec.rb +1 -1
  113. data/spec/rley/parser/state_set_spec.rb +22 -22
  114. data/spec/rley/ptree/non_terminal_node_spec.rb +5 -3
  115. data/spec/rley/ptree/parse_tree_node_spec.rb +4 -4
  116. data/spec/rley/ptree/terminal_node_spec.rb +6 -6
  117. data/spec/rley/sppf/alternative_node_spec.rb +6 -6
  118. data/spec/rley/sppf/non_terminal_node_spec.rb +3 -3
  119. data/spec/rley/sppf/token_node_spec.rb +4 -4
  120. data/spec/rley/support/ambiguous_grammar_helper.rb +3 -4
  121. data/spec/rley/support/grammar_abc_helper.rb +2 -4
  122. data/spec/rley/support/grammar_ambig01_helper.rb +4 -5
  123. data/spec/rley/support/grammar_arr_int_helper.rb +4 -5
  124. data/spec/rley/support/grammar_b_expr_helper.rb +4 -5
  125. data/spec/rley/support/grammar_l0_helper.rb +10 -11
  126. data/spec/rley/support/grammar_pb_helper.rb +6 -5
  127. data/spec/rley/support/grammar_sppf_helper.rb +1 -1
  128. data/spec/rley/syntax/grammar_builder_spec.rb +5 -5
  129. data/spec/rley/syntax/grammar_spec.rb +6 -6
  130. data/spec/rley/syntax/grm_symbol_spec.rb +1 -1
  131. data/spec/rley/syntax/non_terminal_spec.rb +8 -8
  132. data/spec/rley/syntax/production_spec.rb +13 -13
  133. data/spec/rley/syntax/symbol_seq_spec.rb +2 -2
  134. data/spec/rley/syntax/terminal_spec.rb +5 -5
  135. data/spec/rley/syntax/verbatim_symbol_spec.rb +1 -1
  136. data/spec/spec_helper.rb +0 -12
  137. data/spec/support/base_tokenizer_spec.rb +7 -2
  138. metadata +21 -62
  139. data/.simplecov +0 -8
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
- Copyright (c) 2014-2019 Dimitri Geshef
+ Copyright (c) 2014-2021 Dimitri Geshef
 
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -90,7 +90,7 @@ directory
 
  # Let's create a facade object called 'engine'
  # It provides a unified, higher-level interface
- engine = Rley.Engine.new
+ engine = Rley::Engine.new
  ```
 
 
@@ -446,6 +446,7 @@ actively curated by Andrei Beliankou (aka arbox).
  ## Thanks to:
  * Professor Keshav Pingali, one of the creators of the Grammar Flow Graph parsing approach for his encouraging e-mail exchange.
  * [Arjun Menon](https://github.com/arjunmenon) for his NLP example that uses `engtagger` gem.
+ * [Gui Heurich](https://github.com/GuiHeurich) for spotting a mistake in the code sample in `README` file.
 
  ## Grammar Flow Graph
  Since the Grammar Flow Graph parsing approach is quite new, it has not yet taken a place in
@@ -458,5 +459,5 @@ standard parser textbooks. Here are a few references (and links) of papers on GF
 
  Copyright
  ---------
- Copyright (c) 2014-2018, Dimitri Geshef.
+ Copyright (c) 2014-2020, Dimitri Geshef.
  __Rley__ is released under the MIT License see [LICENSE.txt](https://github.com/famished-tiger/Rley/blob/master/LICENSE.txt) for details.
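
Note on the first hunk: `Rley.Engine` is parsed by Ruby as a call to a method named `Engine` on the `Rley` module, which would normally raise NoMethodError; `Rley::Engine` looks up the class constant with the scope-resolution operator. A minimal sketch of the corrected usage (assuming the gem is installed):

    require 'rley'

    # '::' performs constant lookup, so this resolves the Engine class
    # nested inside the Rley module.
    engine = Rley::Engine.new

    # The old README sample would fail at run time:
    # Rley.Engine.new  # => NoMethodError: undefined method `Engine'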
data/examples/NLP/engtagger.rb CHANGED
@@ -1,190 +1,193 @@
- # frozen_string_literal: true
-
- require 'rley'
- require 'engtagger' # Load POS (Part-Of-Speech) tagger EngTagger
-
- # REGEX to remove XML tags from Engtagger output
- GET_TAG = /<(.+?)>(.*?)<.+?>/.freeze
-
- # Text tokenizer
- # Taken directly from Engtagger, will ensure uniform indexing while parsing
- def clean_text(text)
-   return false unless valid_text(text)
-
-   text = text.toutf8
-   cleaned_text = text
-   tokenized = []
-   # Tokenize the text (splitting on punctuation as you go)
-   cleaned_text.split(/\s+/).each do |line|
-     tokenized += split_punct(line)
-   end
-   words = split_sentences(tokenized)
-   return words
- end
-
- def valid_text(text)
-   if !text
-     # there's nothing to parse
-     puts 'method call on uninitialized variable'
-     return false
-   elsif /\A\s*\z/ =~ text
-     # text is an empty string, nothing to parse
-     return false
-   else
-     # $text is valid
-     return true
-   end
- end
-
- def split_sentences(array)
-   tokenized = array
-   people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
-               supt det mssrs rev]
-   army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
-   inst = %w[dept univ assn bros ph.d]
-   place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
-              hwy hway la pde pd plz pl rd st tce]
-   comp = %w[mfg inc ltd co corp]
-   state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
-              ind ia kans kan ken ky la me md is mass mich minn miss mo mont
-              neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
-              va wash wis wisc wy wyo usafa alta man ont que sask yuk]
-   month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
-   misc = %w[vs etc no esp]
-   abbr = {}
-   [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
-     abbr[i] = true
-   end
-   words = []
-   tokenized.each_with_index do |_t, i|
-     if tokenized[i + 1] &&
-        tokenized [i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
-       w = $1
-       # Don't separate the period off words that
-       # meet any of the following conditions:
-       #
-       # 1. It is defined in one of the lists above
-       # 2. It is only one letter long: Alfred E. Sloan
-       # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
-       unless abbr[w.downcase] ||
-              w =~ /\A[a-z]\z/i || w =~ /[a-z](?:\.[a-z])+\z/i
-         words << w
-         words << '.'
-         next
-       end
-     end
-     words << tokenized[i]
-   end
-   # If the final word ends in a period..
-   if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
-     words[-1] = $1
-     words.push '.'
-   end
-   return words
- end
-
- # Separate punctuation from words, where appropriate. This leaves trailing
- # periods in place to be dealt with later. Called by the clean_text method.
- def split_punct(text)
-   # If there's no punctuation, return immediately
-   return [text] if /\A\w+\z/ =~ text
-
-   # Sanity checks
-   text = text.gsub(/\W{10,}/o, ' ')
-
-   # Put quotes into a standard format
-   text = text.gsub(/`(?!`)(?=.*\w)/o, '` ') # Shift left quotes off text
-   text = text.gsub(/"(?=.*\w)/o, ' `` ') # Convert left quotes to ``
-
-   # Convert left quote to `
-   text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? $1 + ' ` ' : ' ` ' }
-   text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
-
-   # Separate right single quotes
-   text = text.gsub(/(\w)'(?!')(?=\W|$)/o, "\\1 ' ")
-
-   # Handle all other punctuation
-   text = text.gsub(/--+/o, ' - ') # Convert and separate dashes
-   text = text.gsub(/,(?!\d)/o, ' , ') # Shift comma if not following by digit
-   text = text.gsub(/:/o, ' :') # Shift semicolon off
-   text = text.gsub(/(\.\.\.+)/o, ' \1 ') # Shift ellipses off
-   text = text.gsub(/([\(\[\{\}\]\)])/o, ' \1 ') # Shift off brackets
-
-   # Shift off other ``standard'' punctuation
-   text = text.gsub(/([\!\?#\$%;~|])/o, ' \1 ')
-
-   # English-specific contractions
-   # Separate off 'd 'm 's
-   text = text.gsub(/([A-Za-z])'([dms])\b/o, "\\1 '\\2")
-   text = text.gsub(/n't\b/o, " n't") # Separate off n't
-   text = text.gsub(/'(ve|ll|re)\b/o, " '\\1") # Separate off 've, 'll, 're
-   result = text.split(' ')
-   return result
- end
-
-
- # Instantiate a facade object as our Rley interface
- nlp_engine = Rley::Engine.new
-
- # Now build a very simplified English grammar...
- nlp_engine.build_grammar do
-   # Terminals have same names as POS tags returned by Engtagger
-   add_terminals('NN', 'NNP')
-   add_terminals('DET', 'IN', 'VBD')
-
-   # Here we define the productions (= grammar rules)
-   rule 'S' => %w[NP VP]
-   rule 'NP' => 'NNP'
-   rule 'NP' => %w[DET NN]
-   rule 'NP' => %w[DET NN PP]
-   rule 'VP' => %w[VBD NP]
-   rule 'VP' => %w[VBD NP PP]
-   rule 'PP' => %w[IN NP]
- end
-
- # text = "Yo I'm not done with you"
- text = 'John saw Mary with a telescope'
- puts "Input text --> #{text}"
-
- tgr = EngTagger.new
-
- # Generate raw POS output
- tagged = tgr.add_tags(text)
-
- # Generte tokenied lexicon of input text
- # Instead of creating a lexicon dictionary,
- # we would simply generate one each time on the fly for the current text only.
- lexicon = clean_text(text)
-
- # Convert EngTagger POS tokens in [[word, pos], ..] format
- tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
-
- def tokenizer(lexicon, tokens)
-   pos = -1
-   rley_tokens = []
-   lexicon.each_with_index do |word, i|
-     term_name = tokens[i].last
-     rank = Rley::Lexical::Position.new(1, pos + 1)
-     pos += word.length + 1 # Assuming one space between words.
-     rley_tokens << Rley::Lexical::Token.new(word, term_name, rank)
-   end
-   return rley_tokens
- end
-
- # Convert input text into a sequence of rley token objects...
- rley_tokens = tokenizer(lexicon, tokens)
-
- # Let Rley grok the tokens
- result = nlp_engine.parse(rley_tokens)
-
- puts "Parsing successful? #{result.success?}" # => Parsing successful? true
- puts result.failure_reason.message unless result.success?
-
- ptree = nlp_engine.convert(result)
-
- visitor = nlp_engine.ptree_visitor(ptree)
-
- renderer = Rley::Formatter::Asciitree.new($stdout)
-
- # Let's visualize the parse tree (in text format...)
- puts renderer.render(visitor)
+ # frozen_string_literal: true
+
+ require 'rley'
+ require 'engtagger' # Load POS (Part-Of-Speech) tagger EngTagger
+
+ # REGEX to remove XML tags from Engtagger output
+ GET_TAG = /<(.+?)>(.*?)<.+?>/.freeze
+
+ # Text tokenizer
+ # Taken directly from Engtagger, will ensure uniform indexing while parsing
+ def clean_text(text)
+   return false unless valid_text(text)
+
+   text = text.toutf8
+   cleaned_text = text
+   tokenized = []
+   # Tokenize the text (splitting on punctuation as you go)
+   cleaned_text.split(/\s+/).each do |line|
+     tokenized += split_punct(line)
+   end
+   words = split_sentences(tokenized)
+   return words
+ end
+
+ def valid_text(text)
+   if !text
+     # there's nothing to parse
+     puts 'method call on uninitialized variable'
+     return false
+   elsif /\A\s*\z/ =~ text
+     # text is an empty string, nothing to parse
+     return false
+   else
+     # $text is valid
+     return true
+   end
+ end
+
+ def split_sentences(array)
+   # rubocop: disable Layout/ArrayAlignment
+   tokenized = array
+   people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+               supt det mssrs rev]
+   army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
+   inst = %w[dept univ assn bros ph.d]
+   place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+              hwy hway la pde pd plz pl rd st tce]
+   comp = %w[mfg inc ltd co corp]
+   state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+              ind ia kans kan ken ky la me md is mass mich minn miss mo mont
+              neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
+              va wash wis wisc wy wyo usafa alta man ont que sask yuk]
+   month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
+   misc = %w[vs etc no esp]
+   abbr = {}
+   [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
+     abbr[i] = true
+   end
+   words = []
+   tokenized.each_with_index do |_t, i|
+     if tokenized[i + 1] &&
+        tokenized[i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
+       w = $1
+       # Don't separate the period off words that
+       # meet any of the following conditions:
+       #
+       # 1. It is defined in one of the lists above
+       # 2. It is only one letter long: Alfred E. Sloan
+       # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
+       unless abbr[w.downcase] ||
+              w =~ /\A[a-z]\z/i || w =~ /[a-z](?:\.[a-z])+\z/i
+         words << w
+         words << '.'
+         next
+       end
+     end
+     words << tokenized[i]
+   end
+
+   # If the final word ends in a period..
+   if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
+     words[-1] = $1
+     words.push '.'
+   end
+   words
+ end
+ # rubocop: enable Layout/ArrayAlignment
+
+ # Separate punctuation from words, where appropriate. This leaves trailing
+ # periods in place to be dealt with later. Called by the clean_text method.
+ def split_punct(text)
+   # If there's no punctuation, return immediately
+   return [text] if /\A\w+\z/ =~ text
+
+   # Sanity checks
+   text = text.gsub(/\W{10,}/o, ' ')
+
+   # Put quotes into a standard format
+   text = text.gsub(/`(?!`)(?=.*\w)/o, '` ') # Shift left quotes off text
+   text = text.gsub(/"(?=.*\w)/o, ' `` ') # Convert left quotes to ``
+
+   # Convert left quote to `
+   text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? "#{$1} ` " : ' ` ' }
+   text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
+
+   # Separate right single quotes
+   text = text.gsub(/(\w)'(?!')(?=\W|$)/o, "\\1 ' ")
+
+   # Handle all other punctuation
+   text = text.gsub(/--+/o, ' - ') # Convert and separate dashes
+   text = text.gsub(/,(?!\d)/o, ' , ') # Shift comma if not following by digit
+   text = text.gsub(/:/o, ' :') # Shift semicolon off
+   text = text.gsub(/(\.\.\.+)/o, ' \1 ') # Shift ellipses off
+   text = text.gsub(/([(\[{}\])])/o, ' \1 ') # Shift off brackets
+
+   # Shift off other ``standard'' punctuation
+   text = text.gsub(/([!?#$%;~|])/o, ' \1 ')
+
+   # English-specific contractions
+   # Separate off 'd 'm 's
+   text = text.gsub(/([A-Za-z])'([dms])\b/o, "\\1 '\\2")
+   text = text.gsub(/n't\b/o, " n't") # Separate off n't
+   text = text.gsub(/'(ve|ll|re)\b/o, " '\\1") # Separate off 've, 'll, 're
+   result = text.split
+   return result
+ end
+
+
+ # Instantiate a facade object as our Rley interface
+ nlp_engine = Rley::Engine.new
+
+ # Now build a very simplified English grammar...
+ nlp_engine.build_grammar do
+   # Terminals have same names as POS tags returned by Engtagger
+   add_terminals('NN', 'NNP')
+   add_terminals('DET', 'IN', 'VBD')
+
+   # Here we define the productions (= grammar rules)
+   rule 'S' => %w[NP VP]
+   rule 'NP' => 'NNP'
+   rule 'NP' => %w[DET NN]
+   rule 'NP' => %w[DET NN PP]
+   rule 'VP' => %w[VBD NP]
+   rule 'VP' => %w[VBD NP PP]
+   rule 'PP' => %w[IN NP]
+ end
+
+ # text = "Yo I'm not done with you"
+ text = 'John saw Mary with a telescope'
+ puts "Input text --> #{text}"
+
+ tgr = EngTagger.new
+
+ # Generate raw POS output
+ tagged = tgr.add_tags(text)
+
+ # Generte tokenied lexicon of input text
+ # Instead of creating a lexicon dictionary,
+ # we would simply generate one each time on the fly for the current text only.
+ lexicon = clean_text(text)
+
+ # Convert EngTagger POS tokens in [[word, pos], ..] format
+ tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
+
+ def tokenizer(lexicon, tokens)
+   pos = -1
+   rley_tokens = []
+   lexicon.each_with_index do |word, i|
+     term_name = tokens[i].last
+     rank = Rley::Lexical::Position.new(1, pos + 1)
+     pos += word.length + 1 # Assuming one space between words.
+     rley_tokens << Rley::Lexical::Token.new(word, term_name, rank)
+   end
+   return rley_tokens
+ end
+
+ # Convert input text into a sequence of rley token objects...
+ rley_tokens = tokenizer(lexicon, tokens)
+
+ # Let Rley grok the tokens
+ result = nlp_engine.parse(rley_tokens)
+
+ puts "Parsing successful? #{result.success?}" # => Parsing successful? true
+ puts result.failure_reason.message unless result.success?
+
+ ptree = nlp_engine.convert(result)
+
+ visitor = nlp_engine.ptree_visitor(ptree)
+
+ renderer = Rley::Formatter::Asciitree.new($stdout)
+
+ # Let's visualize the parse tree (in text format...)
+ puts renderer.render(visitor)
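
The rewrite of this example is almost entirely RuboCop-driven style cleanup: the stray space in `tokenized [i + 1]` goes away, string concatenation in the quote-normalisation block becomes interpolation, redundant escapes are dropped from the bracket and punctuation character classes, the trailing `return words` becomes a bare `words`, and `text.split(' ')` becomes `text.split`. These changes appear to be behaviour-preserving; for the last one, `String#split` without an argument applies the same awk-style whitespace splitting as `split(' ')`. A quick illustrative check (hypothetical input, not part of the example):

    s = '  John   saw Mary '
    # Both forms skip leading whitespace and treat each run of whitespace
    # as a single separator.
    p s.split(' ') # => ["John", "saw", "Mary"]
    p s.split      # => ["John", "saw", "Mary"]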
data/examples/NLP/nano_eng/nano_grammar.rb CHANGED
@@ -11,7 +11,7 @@ require 'rley' # Load the gem
  # based on chapter 12 from Jurafski & Martin book.
  # Daniel Jurafsky, James H. Martin: "Speech and Language Processing";
  # 2009, Pearson Education, Inc., ISBN 978-0135041963
- # It defines the syntax of a sentence in a mini English-like language
+ # It defines the syntax of a sentence in a mini English-like language
  builder = Rley::Syntax::GrammarBuilder.new do
    add_terminals('Pronoun', 'Proper-Noun')
    add_terminals('Determiner', 'Noun')
@@ -21,7 +21,7 @@ builder = Rley::Syntax::GrammarBuilder.new do
 
    rule 'language' => 'sentence'
    rule 'sentence' => 'declarative'
-   rule 'sentence' => 'imperative'
+   rule 'sentence' => 'imperative'
    rule 'sentence' => 'yes_no_question'
    rule 'sentence' => 'wh_subject_question'
    rule 'sentence' => 'wh_non_subject_question'
@@ -33,7 +33,7 @@ builder = Rley::Syntax::GrammarBuilder.new do
    rule 'NP' => %w[Predeterminer NP]
    rule 'NP' => 'Pronoun'
    rule 'NP' => 'Proper-Noun'
-   rule 'NP' => %w[Det Card Ord Quant Nominal]
+   rule 'NP' => %w[Det Card Ord Quant Nominal]
    rule 'VP' => 'Verb'
    rule 'VP' => %w[Verb NP]
    rule 'VP' => %w[Verb NP PP]
@@ -43,12 +43,12 @@ builder = Rley::Syntax::GrammarBuilder.new do
    rule 'Card' => 'Cardinal_number'
    rule 'Card' => []
    rule 'Ord' => 'Ordinal_number'
-   rule 'Ord' => []
+   rule 'Ord' => []
    rule 'Nominal' => 'Noun'
    rule 'Nominal' => %w[Nominal Noun]
    rule 'Nominal' => %w[Nominal GerundVP]
    rule 'Nominal' => %w[Nominal RelClause]
-   rule 'PP' => %w[Preposition NP]
+   rule 'PP' => %w[Preposition NP]
    rule 'GerundVP' => 'GerundV'
    rule 'GerundVP' => %w[GerundV NP]
    rule 'GerundVP' => %w[GerundV NP PP]
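
The five changed lines in this file (spread over four hunks) appear to differ only in trailing whitespace; the grammar itself is unchanged. One detail the context lines highlight: in the GrammarBuilder DSL, an empty array on the right-hand side of `rule` (as in `rule 'Ord' => []`) declares an epsilon production, so `Card` and `Ord` may derive the empty string and are effectively optional inside the `NP` rule. A self-contained sketch of that feature (toy grammar for illustration, assuming the 0.7.x API used above; not taken from the diff):

    require 'rley'

    builder = Rley::Syntax::GrammarBuilder.new do
      add_terminals('Ordinal_number', 'Noun')
      # The left-hand side of the first rule serves as the start symbol.
      rule 'Nominal' => %w[Ord Noun]
      rule 'Ord' => 'Ordinal_number'
      rule 'Ord' => [] # epsilon production: 'Ord' may be omitted
    end
    grammar = builder.grammar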