rley 0.7.07 → 0.7.08

Sign up to get free protection for your applications and to get access to all the features.
Files changed (139) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +348 -54
  3. data/LICENSE.txt +1 -1
  4. data/README.md +3 -2
  5. data/examples/NLP/engtagger.rb +193 -190
  6. data/examples/NLP/nano_eng/nano_grammar.rb +5 -5
  7. data/examples/data_formats/JSON/cli_options.rb +1 -1
  8. data/examples/data_formats/JSON/json_ast_builder.rb +12 -9
  9. data/examples/data_formats/JSON/json_ast_nodes.rb +12 -21
  10. data/examples/data_formats/JSON/json_grammar.rb +2 -2
  11. data/examples/data_formats/JSON/json_lexer.rb +8 -8
  12. data/examples/data_formats/JSON/json_minifier.rb +1 -1
  13. data/examples/general/calc_iter1/calc_ast_builder.rb +13 -10
  14. data/examples/general/calc_iter1/calc_ast_nodes.rb +23 -37
  15. data/examples/general/calc_iter1/calc_grammar.rb +2 -2
  16. data/examples/general/calc_iter1/calc_lexer.rb +6 -4
  17. data/examples/general/calc_iter1/spec/calculator_spec.rb +5 -5
  18. data/examples/general/calc_iter2/calc_ast_builder.rb +5 -3
  19. data/examples/general/calc_iter2/calc_ast_nodes.rb +27 -43
  20. data/examples/general/calc_iter2/calc_grammar.rb +3 -3
  21. data/examples/general/calc_iter2/calc_lexer.rb +11 -10
  22. data/examples/general/calc_iter2/spec/calculator_spec.rb +26 -26
  23. data/examples/general/left.rb +2 -2
  24. data/examples/general/right.rb +2 -2
  25. data/lib/rley/base/dotted_item.rb +23 -31
  26. data/lib/rley/constants.rb +2 -2
  27. data/lib/rley/engine.rb +20 -23
  28. data/lib/rley/formatter/asciitree.rb +3 -3
  29. data/lib/rley/formatter/bracket_notation.rb +1 -8
  30. data/lib/rley/formatter/debug.rb +6 -6
  31. data/lib/rley/formatter/json.rb +2 -2
  32. data/lib/rley/gfg/call_edge.rb +1 -1
  33. data/lib/rley/gfg/edge.rb +5 -5
  34. data/lib/rley/gfg/end_vertex.rb +2 -6
  35. data/lib/rley/gfg/epsilon_edge.rb +1 -5
  36. data/lib/rley/gfg/grm_flow_graph.rb +27 -23
  37. data/lib/rley/gfg/item_vertex.rb +10 -10
  38. data/lib/rley/gfg/non_terminal_vertex.rb +4 -4
  39. data/lib/rley/gfg/scan_edge.rb +1 -1
  40. data/lib/rley/gfg/shortcut_edge.rb +2 -2
  41. data/lib/rley/gfg/start_vertex.rb +4 -8
  42. data/lib/rley/gfg/vertex.rb +43 -39
  43. data/lib/rley/lexical/token_range.rb +6 -6
  44. data/lib/rley/parse_forest_visitor.rb +5 -5
  45. data/lib/rley/parse_rep/ast_base_builder.rb +9 -11
  46. data/lib/rley/parse_rep/cst_builder.rb +5 -6
  47. data/lib/rley/parse_rep/parse_forest_builder.rb +20 -18
  48. data/lib/rley/parse_rep/parse_forest_factory.rb +3 -3
  49. data/lib/rley/parse_rep/parse_rep_creator.rb +11 -13
  50. data/lib/rley/parse_rep/parse_tree_builder.rb +4 -4
  51. data/lib/rley/parse_rep/parse_tree_factory.rb +27 -27
  52. data/lib/rley/parse_tree_visitor.rb +1 -1
  53. data/lib/rley/parser/error_reason.rb +4 -5
  54. data/lib/rley/parser/gfg_chart.rb +20 -22
  55. data/lib/rley/parser/gfg_parsing.rb +16 -30
  56. data/lib/rley/parser/parse_entry.rb +25 -31
  57. data/lib/rley/parser/parse_entry_set.rb +18 -15
  58. data/lib/rley/parser/parse_entry_tracker.rb +4 -4
  59. data/lib/rley/parser/parse_state.rb +16 -21
  60. data/lib/rley/parser/parse_state_tracker.rb +4 -4
  61. data/lib/rley/parser/parse_tracer.rb +13 -13
  62. data/lib/rley/parser/parse_walker_factory.rb +23 -28
  63. data/lib/rley/parser/state_set.rb +9 -10
  64. data/lib/rley/ptree/non_terminal_node.rb +7 -5
  65. data/lib/rley/ptree/parse_tree.rb +3 -3
  66. data/lib/rley/ptree/parse_tree_node.rb +5 -5
  67. data/lib/rley/ptree/terminal_node.rb +7 -7
  68. data/lib/rley/rley_error.rb +12 -12
  69. data/lib/rley/sppf/alternative_node.rb +6 -6
  70. data/lib/rley/sppf/composite_node.rb +7 -7
  71. data/lib/rley/sppf/epsilon_node.rb +3 -3
  72. data/lib/rley/sppf/leaf_node.rb +3 -3
  73. data/lib/rley/sppf/parse_forest.rb +16 -16
  74. data/lib/rley/sppf/sppf_node.rb +7 -8
  75. data/lib/rley/sppf/token_node.rb +3 -3
  76. data/lib/rley/syntax/grammar.rb +5 -5
  77. data/lib/rley/syntax/grammar_builder.rb +9 -9
  78. data/lib/rley/syntax/grm_symbol.rb +6 -6
  79. data/lib/rley/syntax/non_terminal.rb +9 -15
  80. data/lib/rley/syntax/production.rb +10 -10
  81. data/lib/rley/syntax/symbol_seq.rb +7 -9
  82. data/lib/rley/syntax/terminal.rb +4 -5
  83. data/lib/rley/syntax/verbatim_symbol.rb +3 -3
  84. data/lib/support/base_tokenizer.rb +19 -18
  85. data/spec/rley/base/dotted_item_spec.rb +2 -2
  86. data/spec/rley/engine_spec.rb +17 -15
  87. data/spec/rley/formatter/asciitree_spec.rb +7 -7
  88. data/spec/rley/formatter/bracket_notation_spec.rb +13 -13
  89. data/spec/rley/formatter/json_spec.rb +1 -1
  90. data/spec/rley/gfg/end_vertex_spec.rb +5 -5
  91. data/spec/rley/gfg/item_vertex_spec.rb +10 -10
  92. data/spec/rley/gfg/non_terminal_vertex_spec.rb +3 -3
  93. data/spec/rley/gfg/shortcut_edge_spec.rb +1 -1
  94. data/spec/rley/gfg/start_vertex_spec.rb +5 -5
  95. data/spec/rley/gfg/vertex_spec.rb +3 -3
  96. data/spec/rley/lexical/token_range_spec.rb +16 -16
  97. data/spec/rley/lexical/token_spec.rb +2 -2
  98. data/spec/rley/parse_forest_visitor_spec.rb +165 -163
  99. data/spec/rley/parse_rep/ambiguous_parse_spec.rb +44 -44
  100. data/spec/rley/parse_rep/ast_builder_spec.rb +6 -6
  101. data/spec/rley/parse_rep/cst_builder_spec.rb +5 -5
  102. data/spec/rley/parse_rep/groucho_spec.rb +21 -21
  103. data/spec/rley/parse_rep/parse_forest_builder_spec.rb +26 -26
  104. data/spec/rley/parse_rep/parse_forest_factory_spec.rb +6 -6
  105. data/spec/rley/parse_rep/parse_tree_factory_spec.rb +2 -2
  106. data/spec/rley/parse_tree_visitor_spec.rb +10 -8
  107. data/spec/rley/parser/error_reason_spec.rb +6 -6
  108. data/spec/rley/parser/gfg_earley_parser_spec.rb +4 -2
  109. data/spec/rley/parser/gfg_parsing_spec.rb +4 -8
  110. data/spec/rley/parser/parse_entry_spec.rb +19 -19
  111. data/spec/rley/parser/parse_state_spec.rb +5 -5
  112. data/spec/rley/parser/parse_walker_factory_spec.rb +1 -1
  113. data/spec/rley/parser/state_set_spec.rb +22 -22
  114. data/spec/rley/ptree/non_terminal_node_spec.rb +5 -3
  115. data/spec/rley/ptree/parse_tree_node_spec.rb +4 -4
  116. data/spec/rley/ptree/terminal_node_spec.rb +6 -6
  117. data/spec/rley/sppf/alternative_node_spec.rb +6 -6
  118. data/spec/rley/sppf/non_terminal_node_spec.rb +3 -3
  119. data/spec/rley/sppf/token_node_spec.rb +4 -4
  120. data/spec/rley/support/ambiguous_grammar_helper.rb +3 -4
  121. data/spec/rley/support/grammar_abc_helper.rb +2 -4
  122. data/spec/rley/support/grammar_ambig01_helper.rb +4 -5
  123. data/spec/rley/support/grammar_arr_int_helper.rb +4 -5
  124. data/spec/rley/support/grammar_b_expr_helper.rb +4 -5
  125. data/spec/rley/support/grammar_l0_helper.rb +10 -11
  126. data/spec/rley/support/grammar_pb_helper.rb +6 -5
  127. data/spec/rley/support/grammar_sppf_helper.rb +1 -1
  128. data/spec/rley/syntax/grammar_builder_spec.rb +5 -5
  129. data/spec/rley/syntax/grammar_spec.rb +6 -6
  130. data/spec/rley/syntax/grm_symbol_spec.rb +1 -1
  131. data/spec/rley/syntax/non_terminal_spec.rb +8 -8
  132. data/spec/rley/syntax/production_spec.rb +13 -13
  133. data/spec/rley/syntax/symbol_seq_spec.rb +2 -2
  134. data/spec/rley/syntax/terminal_spec.rb +5 -5
  135. data/spec/rley/syntax/verbatim_symbol_spec.rb +1 -1
  136. data/spec/spec_helper.rb +0 -12
  137. data/spec/support/base_tokenizer_spec.rb +7 -2
  138. metadata +21 -62
  139. data/.simplecov +0 -8
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2014-2019 Dimitri Geshef
1
+ Copyright (c) 2014-2021 Dimitri Geshef
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining a copy
4
4
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -90,7 +90,7 @@ directory
90
90
 
91
91
  # Let's create a facade object called 'engine'
92
92
  # It provides a unified, higher-level interface
93
- engine = Rley.Engine.new
93
+ engine = Rley::Engine.new
94
94
  ```
95
95
 
96
96
 
@@ -446,6 +446,7 @@ actively curated by Andrei Beliankou (aka arbox).
446
446
  ## Thanks to:
447
447
  * Professor Keshav Pingali, one of the creators of the Grammar Flow Graph parsing approach for his encouraging e-mail exchange.
448
448
  * [Arjun Menon](https://github.com/arjunmenon) for his NLP example that uses `engtagger` gem.
449
+ * [Gui Heurich](https://github.com/GuiHeurich) for spotting a mistake in the code sample in `README` file.
449
450
 
450
451
  ## Grammar Flow Graph
451
452
  Since the Grammar Flow Graph parsing approach is quite new, it has not yet taken a place in
@@ -458,5 +459,5 @@ standard parser textbooks. Here are a few references (and links) of papers on GF
458
459
 
459
460
  Copyright
460
461
  ---------
461
- Copyright (c) 2014-2018, Dimitri Geshef.
462
+ Copyright (c) 2014-2020, Dimitri Geshef.
462
463
  __Rley__ is released under the MIT License see [LICENSE.txt](https://github.com/famished-tiger/Rley/blob/master/LICENSE.txt) for details.
@@ -1,190 +1,193 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rley'
4
- require 'engtagger' # Load POS (Part-Of-Speech) tagger EngTagger
5
-
6
- # REGEX to remove XML tags from Engtagger output
7
- GET_TAG = /<(.+?)>(.*?)<.+?>/.freeze
8
-
9
- # Text tokenizer
10
- # Taken directly from Engtagger, will ensure uniform indexing while parsing
11
- def clean_text(text)
12
- return false unless valid_text(text)
13
-
14
- text = text.toutf8
15
- cleaned_text = text
16
- tokenized = []
17
- # Tokenize the text (splitting on punctuation as you go)
18
- cleaned_text.split(/\s+/).each do |line|
19
- tokenized += split_punct(line)
20
- end
21
- words = split_sentences(tokenized)
22
- return words
23
- end
24
-
25
- def valid_text(text)
26
- if !text
27
- # there's nothing to parse
28
- puts 'method call on uninitialized variable'
29
- return false
30
- elsif /\A\s*\z/ =~ text
31
- # text is an empty string, nothing to parse
32
- return false
33
- else
34
- # $text is valid
35
- return true
36
- end
37
- end
38
-
39
- def split_sentences(array)
40
- tokenized = array
41
- people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
42
- supt det mssrs rev]
43
- army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
44
- inst = %w[dept univ assn bros ph.d]
45
- place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
46
- hwy hway la pde pd plz pl rd st tce]
47
- comp = %w[mfg inc ltd co corp]
48
- state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
49
- ind ia kans kan ken ky la me md is mass mich minn miss mo mont
50
- neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
51
- va wash wis wisc wy wyo usafa alta man ont que sask yuk]
52
- month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
53
- misc = %w[vs etc no esp]
54
- abbr = {}
55
- [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
56
- abbr[i] = true
57
- end
58
- words = []
59
- tokenized.each_with_index do |_t, i|
60
- if tokenized[i + 1] &&
61
- tokenized [i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
62
- w = $1
63
- # Don't separate the period off words that
64
- # meet any of the following conditions:
65
- #
66
- # 1. It is defined in one of the lists above
67
- # 2. It is only one letter long: Alfred E. Sloan
68
- # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
69
- unless abbr[w.downcase] ||
70
- w =~ /\A[a-z]\z/i || w =~ /[a-z](?:\.[a-z])+\z/i
71
- words << w
72
- words << '.'
73
- next
74
- end
75
- end
76
- words << tokenized[i]
77
- end
78
- # If the final word ends in a period..
79
- if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
80
- words[-1] = $1
81
- words.push '.'
82
- end
83
- return words
84
- end
85
-
86
- # Separate punctuation from words, where appropriate. This leaves trailing
87
- # periods in place to be dealt with later. Called by the clean_text method.
88
- def split_punct(text)
89
- # If there's no punctuation, return immediately
90
- return [text] if /\A\w+\z/ =~ text
91
-
92
- # Sanity checks
93
- text = text.gsub(/\W{10,}/o, ' ')
94
-
95
- # Put quotes into a standard format
96
- text = text.gsub(/`(?!`)(?=.*\w)/o, '` ') # Shift left quotes off text
97
- text = text.gsub(/"(?=.*\w)/o, ' `` ') # Convert left quotes to ``
98
-
99
- # Convert left quote to `
100
- text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? $1 + ' ` ' : ' ` ' }
101
- text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
102
-
103
- # Separate right single quotes
104
- text = text.gsub(/(\w)'(?!')(?=\W|$)/o, "\\1 ' ")
105
-
106
- # Handle all other punctuation
107
- text = text.gsub(/--+/o, ' - ') # Convert and separate dashes
108
- text = text.gsub(/,(?!\d)/o, ' , ') # Shift comma if not followed by a digit
109
- text = text.gsub(/:/o, ' :') # Shift semicolon off
110
- text = text.gsub(/(\.\.\.+)/o, ' \1 ') # Shift ellipses off
111
- text = text.gsub(/([\(\[\{\}\]\)])/o, ' \1 ') # Shift off brackets
112
-
113
- # Shift off other ``standard'' punctuation
114
- text = text.gsub(/([\!\?#\$%;~|])/o, ' \1 ')
115
-
116
- # English-specific contractions
117
- # Separate off 'd 'm 's
118
- text = text.gsub(/([A-Za-z])'([dms])\b/o, "\\1 '\\2")
119
- text = text.gsub(/n't\b/o, " n't") # Separate off n't
120
- text = text.gsub(/'(ve|ll|re)\b/o, " '\\1") # Separate off 've, 'll, 're
121
- result = text.split(' ')
122
- return result
123
- end
124
-
125
-
126
- # Instantiate a facade object as our Rley interface
127
- nlp_engine = Rley::Engine.new
128
-
129
- # Now build a very simplified English grammar...
130
- nlp_engine.build_grammar do
131
- # Terminals have same names as POS tags returned by Engtagger
132
- add_terminals('NN', 'NNP')
133
- add_terminals('DET', 'IN', 'VBD')
134
-
135
- # Here we define the productions (= grammar rules)
136
- rule 'S' => %w[NP VP]
137
- rule 'NP' => 'NNP'
138
- rule 'NP' => %w[DET NN]
139
- rule 'NP' => %w[DET NN PP]
140
- rule 'VP' => %w[VBD NP]
141
- rule 'VP' => %w[VBD NP PP]
142
- rule 'PP' => %w[IN NP]
143
- end
144
-
145
- # text = "Yo I'm not done with you"
146
- text = 'John saw Mary with a telescope'
147
- puts "Input text --> #{text}"
148
-
149
- tgr = EngTagger.new
150
-
151
- # Generate raw POS output
152
- tagged = tgr.add_tags(text)
153
-
154
- # Generate tokenized lexicon of input text
155
- # Instead of creating a lexicon dictionary,
156
- # we would simply generate one each time on the fly for the current text only.
157
- lexicon = clean_text(text)
158
-
159
- # Convert EngTagger POS tokens in [[word, pos], ..] format
160
- tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
161
-
162
- def tokenizer(lexicon, tokens)
163
- pos = -1
164
- rley_tokens = []
165
- lexicon.each_with_index do |word, i|
166
- term_name = tokens[i].last
167
- rank = Rley::Lexical::Position.new(1, pos + 1)
168
- pos += word.length + 1 # Assuming one space between words.
169
- rley_tokens << Rley::Lexical::Token.new(word, term_name, rank)
170
- end
171
- return rley_tokens
172
- end
173
-
174
- # Convert input text into a sequence of rley token objects...
175
- rley_tokens = tokenizer(lexicon, tokens)
176
-
177
- # Let Rley grok the tokens
178
- result = nlp_engine.parse(rley_tokens)
179
-
180
- puts "Parsing successful? #{result.success?}" # => Parsing successful? true
181
- puts result.failure_reason.message unless result.success?
182
-
183
- ptree = nlp_engine.convert(result)
184
-
185
- visitor = nlp_engine.ptree_visitor(ptree)
186
-
187
- renderer = Rley::Formatter::Asciitree.new($stdout)
188
-
189
- # Let's visualize the parse tree (in text format...)
190
- puts renderer.render(visitor)
1
+ # frozen_string_literal: true
2
+
3
+ require 'rley'
4
+ require 'engtagger' # Load POS (Part-Of-Speech) tagger EngTagger
5
+
6
+ # REGEX to remove XML tags from Engtagger output
7
+ GET_TAG = /<(.+?)>(.*?)<.+?>/.freeze
8
+
9
+ # Text tokenizer
10
+ # Taken directly from Engtagger, will ensure uniform indexing while parsing
11
+ def clean_text(text)
12
+ return false unless valid_text(text)
13
+
14
+ text = text.toutf8
15
+ cleaned_text = text
16
+ tokenized = []
17
+ # Tokenize the text (splitting on punctuation as you go)
18
+ cleaned_text.split(/\s+/).each do |line|
19
+ tokenized += split_punct(line)
20
+ end
21
+ words = split_sentences(tokenized)
22
+ return words
23
+ end
24
+
25
+ def valid_text(text)
26
+ if !text
27
+ # there's nothing to parse
28
+ puts 'method call on uninitialized variable'
29
+ return false
30
+ elsif /\A\s*\z/ =~ text
31
+ # text is an empty string, nothing to parse
32
+ return false
33
+ else
34
+ # $text is valid
35
+ return true
36
+ end
37
+ end
38
+
39
+ def split_sentences(array)
40
+ # rubocop: disable Layout/ArrayAlignment
41
+ tokenized = array
42
+ people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
43
+ supt det mssrs rev]
44
+ army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
45
+ inst = %w[dept univ assn bros ph.d]
46
+ place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
47
+ hwy hway la pde pd plz pl rd st tce]
48
+ comp = %w[mfg inc ltd co corp]
49
+ state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
50
+ ind ia kans kan ken ky la me md is mass mich minn miss mo mont
51
+ neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
52
+ va wash wis wisc wy wyo usafa alta man ont que sask yuk]
53
+ month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
54
+ misc = %w[vs etc no esp]
55
+ abbr = {}
56
+ [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
57
+ abbr[i] = true
58
+ end
59
+ words = []
60
+ tokenized.each_with_index do |_t, i|
61
+ if tokenized[i + 1] &&
62
+ tokenized[i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
63
+ w = $1
64
+ # Don't separate the period off words that
65
+ # meet any of the following conditions:
66
+ #
67
+ # 1. It is defined in one of the lists above
68
+ # 2. It is only one letter long: Alfred E. Sloan
69
+ # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
70
+ unless abbr[w.downcase] ||
71
+ w =~ /\A[a-z]\z/i || w =~ /[a-z](?:\.[a-z])+\z/i
72
+ words << w
73
+ words << '.'
74
+ next
75
+ end
76
+ end
77
+ words << tokenized[i]
78
+ end
79
+
80
+ # If the final word ends in a period..
81
+ if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
82
+ words[-1] = $1
83
+ words.push '.'
84
+ end
85
+ words
86
+ end
87
+ # rubocop: enable Layout/ArrayAlignment
88
+
89
+ # Separate punctuation from words, where appropriate. This leaves trailing
90
+ # periods in place to be dealt with later. Called by the clean_text method.
91
+ def split_punct(text)
92
+ # If there's no punctuation, return immediately
93
+ return [text] if /\A\w+\z/ =~ text
94
+
95
+ # Sanity checks
96
+ text = text.gsub(/\W{10,}/o, ' ')
97
+
98
+ # Put quotes into a standard format
99
+ text = text.gsub(/`(?!`)(?=.*\w)/o, '` ') # Shift left quotes off text
100
+ text = text.gsub(/"(?=.*\w)/o, ' `` ') # Convert left quotes to ``
101
+
102
+ # Convert left quote to `
103
+ text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? "#{$1} ` " : ' ` ' }
104
+ text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
105
+
106
+ # Separate right single quotes
107
+ text = text.gsub(/(\w)'(?!')(?=\W|$)/o, "\\1 ' ")
108
+
109
+ # Handle all other punctuation
110
+ text = text.gsub(/--+/o, ' - ') # Convert and separate dashes
111
+ text = text.gsub(/,(?!\d)/o, ' , ') # Shift comma if not followed by a digit
112
+ text = text.gsub(/:/o, ' :') # Shift semicolon off
113
+ text = text.gsub(/(\.\.\.+)/o, ' \1 ') # Shift ellipses off
114
+ text = text.gsub(/([(\[{}\])])/o, ' \1 ') # Shift off brackets
115
+
116
+ # Shift off other ``standard'' punctuation
117
+ text = text.gsub(/([!?#$%;~|])/o, ' \1 ')
118
+
119
+ # English-specific contractions
120
+ # Separate off 'd 'm 's
121
+ text = text.gsub(/([A-Za-z])'([dms])\b/o, "\\1 '\\2")
122
+ text = text.gsub(/n't\b/o, " n't") # Separate off n't
123
+ text = text.gsub(/'(ve|ll|re)\b/o, " '\\1") # Separate off 've, 'll, 're
124
+ result = text.split
125
+ return result
126
+ end
127
+
128
+
129
+ # Instantiate a facade object as our Rley interface
130
+ nlp_engine = Rley::Engine.new
131
+
132
+ # Now build a very simplified English grammar...
133
+ nlp_engine.build_grammar do
134
+ # Terminals have same names as POS tags returned by Engtagger
135
+ add_terminals('NN', 'NNP')
136
+ add_terminals('DET', 'IN', 'VBD')
137
+
138
+ # Here we define the productions (= grammar rules)
139
+ rule 'S' => %w[NP VP]
140
+ rule 'NP' => 'NNP'
141
+ rule 'NP' => %w[DET NN]
142
+ rule 'NP' => %w[DET NN PP]
143
+ rule 'VP' => %w[VBD NP]
144
+ rule 'VP' => %w[VBD NP PP]
145
+ rule 'PP' => %w[IN NP]
146
+ end
147
+
148
+ # text = "Yo I'm not done with you"
149
+ text = 'John saw Mary with a telescope'
150
+ puts "Input text --> #{text}"
151
+
152
+ tgr = EngTagger.new
153
+
154
+ # Generate raw POS output
155
+ tagged = tgr.add_tags(text)
156
+
157
+ # Generate tokenized lexicon of input text
158
+ # Instead of creating a lexicon dictionary,
159
+ # we would simply generate one each time on the fly for the current text only.
160
+ lexicon = clean_text(text)
161
+
162
+ # Convert EngTagger POS tokens in [[word, pos], ..] format
163
+ tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
164
+
165
+ def tokenizer(lexicon, tokens)
166
+ pos = -1
167
+ rley_tokens = []
168
+ lexicon.each_with_index do |word, i|
169
+ term_name = tokens[i].last
170
+ rank = Rley::Lexical::Position.new(1, pos + 1)
171
+ pos += word.length + 1 # Assuming one space between words.
172
+ rley_tokens << Rley::Lexical::Token.new(word, term_name, rank)
173
+ end
174
+ return rley_tokens
175
+ end
176
+
177
+ # Convert input text into a sequence of rley token objects...
178
+ rley_tokens = tokenizer(lexicon, tokens)
179
+
180
+ # Let Rley grok the tokens
181
+ result = nlp_engine.parse(rley_tokens)
182
+
183
+ puts "Parsing successful? #{result.success?}" # => Parsing successful? true
184
+ puts result.failure_reason.message unless result.success?
185
+
186
+ ptree = nlp_engine.convert(result)
187
+
188
+ visitor = nlp_engine.ptree_visitor(ptree)
189
+
190
+ renderer = Rley::Formatter::Asciitree.new($stdout)
191
+
192
+ # Let's visualize the parse tree (in text format...)
193
+ puts renderer.render(visitor)
@@ -11,7 +11,7 @@ require 'rley' # Load the gem
11
11
  # based on chapter 12 from Jurafski & Martin book.
12
12
  # Daniel Jurafsky,‎ James H. Martin: "Speech and Language Processing";
13
13
  # 2009, Pearson Education, Inc., ISBN 978-0135041963
14
- # It defines the syntax of a sentence in a mini English-like language
14
+ # It defines the syntax of a sentence in a mini English-like language
15
15
  builder = Rley::Syntax::GrammarBuilder.new do
16
16
  add_terminals('Pronoun', 'Proper-Noun')
17
17
  add_terminals('Determiner', 'Noun')
@@ -21,7 +21,7 @@ builder = Rley::Syntax::GrammarBuilder.new do
21
21
 
22
22
  rule 'language' => 'sentence'
23
23
  rule 'sentence' => 'declarative'
24
- rule 'sentence' => 'imperative'
24
+ rule 'sentence' => 'imperative'
25
25
  rule 'sentence' => 'yes_no_question'
26
26
  rule 'sentence' => 'wh_subject_question'
27
27
  rule 'sentence' => 'wh_non_subject_question'
@@ -33,7 +33,7 @@ builder = Rley::Syntax::GrammarBuilder.new do
33
33
  rule 'NP' => %w[Predeterminer NP]
34
34
  rule 'NP' => 'Pronoun'
35
35
  rule 'NP' => 'Proper-Noun'
36
- rule 'NP' => %w[Det Card Ord Quant Nominal]
36
+ rule 'NP' => %w[Det Card Ord Quant Nominal]
37
37
  rule 'VP' => 'Verb'
38
38
  rule 'VP' => %w[Verb NP]
39
39
  rule 'VP' => %w[Verb NP PP]
@@ -43,12 +43,12 @@ builder = Rley::Syntax::GrammarBuilder.new do
43
43
  rule 'Card' => 'Cardinal_number'
44
44
  rule 'Card' => []
45
45
  rule 'Ord' => 'Ordinal_number'
46
- rule 'Ord' => []
46
+ rule 'Ord' => []
47
47
  rule 'Nominal' => 'Noun'
48
48
  rule 'Nominal' => %w[Nominal Noun]
49
49
  rule 'Nominal' => %w[Nominal GerundVP]
50
50
  rule 'Nominal' => %w[Nominal RelClause]
51
- rule 'PP' => %w[Preposition NP]
51
+ rule 'PP' => %w[Preposition NP]
52
52
  rule 'GerundVP' => 'GerundV'
53
53
  rule 'GerundVP' => %w[GerundV NP]
54
54
  rule 'GerundVP' => %w[GerundV NP PP]