rley 0.7.06 → 0.8.01

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +362 -62
  3. data/.travis.yml +6 -6
  4. data/CHANGELOG.md +20 -4
  5. data/LICENSE.txt +1 -1
  6. data/README.md +7 -7
  7. data/examples/NLP/engtagger.rb +193 -190
  8. data/examples/NLP/nano_eng/nano_en_demo.rb +7 -11
  9. data/examples/NLP/nano_eng/nano_grammar.rb +21 -21
  10. data/examples/NLP/pico_en_demo.rb +2 -2
  11. data/examples/data_formats/JSON/cli_options.rb +1 -1
  12. data/examples/data_formats/JSON/json_ast_builder.rb +21 -27
  13. data/examples/data_formats/JSON/json_ast_nodes.rb +12 -21
  14. data/examples/data_formats/JSON/json_demo.rb +1 -2
  15. data/examples/data_formats/JSON/json_grammar.rb +13 -13
  16. data/examples/data_formats/JSON/json_lexer.rb +8 -8
  17. data/examples/data_formats/JSON/json_minifier.rb +1 -1
  18. data/examples/general/calc_iter1/calc_ast_builder.rb +13 -10
  19. data/examples/general/calc_iter1/calc_ast_nodes.rb +23 -37
  20. data/examples/general/calc_iter1/calc_grammar.rb +7 -6
  21. data/examples/general/calc_iter1/calc_lexer.rb +6 -4
  22. data/examples/general/calc_iter1/spec/calculator_spec.rb +5 -5
  23. data/examples/general/calc_iter2/calc_ast_builder.rb +5 -3
  24. data/examples/general/calc_iter2/calc_ast_nodes.rb +27 -43
  25. data/examples/general/calc_iter2/calc_grammar.rb +12 -12
  26. data/examples/general/calc_iter2/calc_lexer.rb +11 -10
  27. data/examples/general/calc_iter2/spec/calculator_spec.rb +26 -26
  28. data/examples/general/left.rb +2 -2
  29. data/examples/general/right.rb +2 -2
  30. data/lib/rley.rb +1 -1
  31. data/lib/rley/base/dotted_item.rb +28 -31
  32. data/lib/rley/base/grm_items_builder.rb +6 -0
  33. data/lib/rley/constants.rb +2 -2
  34. data/lib/rley/engine.rb +22 -25
  35. data/lib/rley/formatter/asciitree.rb +3 -3
  36. data/lib/rley/formatter/bracket_notation.rb +1 -8
  37. data/lib/rley/formatter/debug.rb +6 -6
  38. data/lib/rley/formatter/json.rb +2 -2
  39. data/lib/rley/gfg/call_edge.rb +1 -1
  40. data/lib/rley/gfg/edge.rb +5 -5
  41. data/lib/rley/gfg/end_vertex.rb +2 -6
  42. data/lib/rley/gfg/epsilon_edge.rb +1 -5
  43. data/lib/rley/gfg/grm_flow_graph.rb +27 -23
  44. data/lib/rley/gfg/item_vertex.rb +10 -10
  45. data/lib/rley/gfg/non_terminal_vertex.rb +4 -4
  46. data/lib/rley/gfg/scan_edge.rb +1 -1
  47. data/lib/rley/gfg/shortcut_edge.rb +2 -2
  48. data/lib/rley/gfg/start_vertex.rb +4 -8
  49. data/lib/rley/gfg/vertex.rb +43 -39
  50. data/lib/rley/interface.rb +16 -0
  51. data/lib/rley/lexical/token_range.rb +6 -6
  52. data/lib/rley/notation/all_notation_nodes.rb +2 -0
  53. data/lib/rley/notation/ast_builder.rb +191 -0
  54. data/lib/rley/notation/ast_node.rb +44 -0
  55. data/lib/rley/notation/ast_visitor.rb +113 -0
  56. data/lib/rley/notation/grammar.rb +49 -0
  57. data/lib/rley/notation/grammar_builder.rb +504 -0
  58. data/lib/rley/notation/grouping_node.rb +23 -0
  59. data/lib/rley/notation/parser.rb +56 -0
  60. data/lib/rley/notation/sequence_node.rb +35 -0
  61. data/lib/rley/notation/symbol_node.rb +29 -0
  62. data/lib/rley/notation/tokenizer.rb +192 -0
  63. data/lib/rley/parse_forest_visitor.rb +5 -5
  64. data/lib/rley/parse_rep/ast_base_builder.rb +48 -11
  65. data/lib/rley/parse_rep/cst_builder.rb +5 -6
  66. data/lib/rley/parse_rep/parse_forest_builder.rb +22 -18
  67. data/lib/rley/parse_rep/parse_forest_factory.rb +3 -3
  68. data/lib/rley/parse_rep/parse_rep_creator.rb +14 -16
  69. data/lib/rley/parse_rep/parse_tree_builder.rb +4 -4
  70. data/lib/rley/parse_rep/parse_tree_factory.rb +27 -27
  71. data/lib/rley/parse_tree_visitor.rb +1 -1
  72. data/lib/rley/parser/error_reason.rb +4 -5
  73. data/lib/rley/parser/gfg_chart.rb +118 -26
  74. data/lib/rley/parser/gfg_parsing.rb +22 -33
  75. data/lib/rley/parser/parse_entry.rb +25 -31
  76. data/lib/rley/parser/parse_entry_set.rb +19 -16
  77. data/lib/rley/parser/parse_entry_tracker.rb +4 -4
  78. data/lib/rley/parser/parse_tracer.rb +13 -13
  79. data/lib/rley/parser/parse_walker_factory.rb +23 -28
  80. data/lib/rley/ptree/non_terminal_node.rb +7 -5
  81. data/lib/rley/ptree/parse_tree.rb +3 -3
  82. data/lib/rley/ptree/parse_tree_node.rb +5 -5
  83. data/lib/rley/ptree/terminal_node.rb +7 -7
  84. data/lib/rley/rley_error.rb +12 -12
  85. data/lib/rley/sppf/alternative_node.rb +6 -6
  86. data/lib/rley/sppf/composite_node.rb +7 -7
  87. data/lib/rley/sppf/epsilon_node.rb +3 -3
  88. data/lib/rley/sppf/leaf_node.rb +3 -3
  89. data/lib/rley/sppf/parse_forest.rb +16 -16
  90. data/lib/rley/sppf/sppf_node.rb +7 -8
  91. data/lib/rley/sppf/token_node.rb +3 -3
  92. data/lib/rley/syntax/{grammar_builder.rb → base_grammar_builder.rb} +61 -23
  93. data/lib/rley/syntax/grammar.rb +5 -5
  94. data/lib/rley/syntax/grm_symbol.rb +7 -7
  95. data/lib/rley/syntax/match_closest.rb +43 -0
  96. data/lib/rley/syntax/non_terminal.rb +9 -15
  97. data/lib/rley/syntax/production.rb +16 -10
  98. data/lib/rley/syntax/symbol_seq.rb +7 -9
  99. data/lib/rley/syntax/terminal.rb +4 -5
  100. data/lib/rley/syntax/verbatim_symbol.rb +3 -3
  101. data/lib/support/base_tokenizer.rb +19 -18
  102. data/spec/rley/base/dotted_item_spec.rb +2 -2
  103. data/spec/rley/engine_spec.rb +23 -21
  104. data/spec/rley/formatter/asciitree_spec.rb +7 -7
  105. data/spec/rley/formatter/bracket_notation_spec.rb +13 -13
  106. data/spec/rley/formatter/json_spec.rb +1 -1
  107. data/spec/rley/gfg/end_vertex_spec.rb +5 -5
  108. data/spec/rley/gfg/grm_flow_graph_spec.rb +2 -2
  109. data/spec/rley/gfg/item_vertex_spec.rb +10 -10
  110. data/spec/rley/gfg/non_terminal_vertex_spec.rb +3 -3
  111. data/spec/rley/gfg/shortcut_edge_spec.rb +1 -1
  112. data/spec/rley/gfg/start_vertex_spec.rb +5 -5
  113. data/spec/rley/gfg/vertex_spec.rb +3 -3
  114. data/spec/rley/lexical/token_range_spec.rb +16 -16
  115. data/spec/rley/lexical/token_spec.rb +2 -2
  116. data/spec/rley/notation/grammar_builder_spec.rb +302 -0
  117. data/spec/rley/notation/parser_spec.rb +184 -0
  118. data/spec/rley/notation/tokenizer_spec.rb +370 -0
  119. data/spec/rley/parse_forest_visitor_spec.rb +165 -163
  120. data/spec/rley/parse_rep/ambiguous_parse_spec.rb +44 -44
  121. data/spec/rley/parse_rep/ast_builder_spec.rb +6 -7
  122. data/spec/rley/parse_rep/cst_builder_spec.rb +5 -5
  123. data/spec/rley/parse_rep/groucho_spec.rb +24 -26
  124. data/spec/rley/parse_rep/parse_forest_builder_spec.rb +27 -27
  125. data/spec/rley/parse_rep/parse_forest_factory_spec.rb +8 -8
  126. data/spec/rley/parse_rep/parse_tree_factory_spec.rb +3 -3
  127. data/spec/rley/parse_tree_visitor_spec.rb +10 -8
  128. data/spec/rley/parser/dangling_else_spec.rb +445 -0
  129. data/spec/rley/parser/error_reason_spec.rb +6 -6
  130. data/spec/rley/parser/gfg_earley_parser_spec.rb +120 -12
  131. data/spec/rley/parser/gfg_parsing_spec.rb +6 -13
  132. data/spec/rley/parser/parse_entry_spec.rb +19 -19
  133. data/spec/rley/parser/parse_walker_factory_spec.rb +10 -10
  134. data/spec/rley/ptree/non_terminal_node_spec.rb +5 -3
  135. data/spec/rley/ptree/parse_tree_node_spec.rb +4 -4
  136. data/spec/rley/ptree/terminal_node_spec.rb +6 -6
  137. data/spec/rley/sppf/alternative_node_spec.rb +6 -6
  138. data/spec/rley/sppf/non_terminal_node_spec.rb +3 -3
  139. data/spec/rley/sppf/token_node_spec.rb +4 -4
  140. data/spec/rley/support/ambiguous_grammar_helper.rb +4 -5
  141. data/spec/rley/support/grammar_abc_helper.rb +3 -5
  142. data/spec/rley/support/grammar_ambig01_helper.rb +5 -6
  143. data/spec/rley/support/grammar_arr_int_helper.rb +5 -6
  144. data/spec/rley/support/grammar_b_expr_helper.rb +5 -6
  145. data/spec/rley/support/grammar_int_seq_helper.rb +51 -0
  146. data/spec/rley/support/grammar_l0_helper.rb +14 -17
  147. data/spec/rley/support/grammar_pb_helper.rb +8 -7
  148. data/spec/rley/support/grammar_sppf_helper.rb +3 -3
  149. data/spec/rley/syntax/{grammar_builder_spec.rb → base_grammar_builder_spec.rb} +35 -16
  150. data/spec/rley/syntax/grammar_spec.rb +6 -6
  151. data/spec/rley/syntax/grm_symbol_spec.rb +1 -1
  152. data/spec/rley/syntax/match_closest_spec.rb +46 -0
  153. data/spec/rley/syntax/non_terminal_spec.rb +8 -8
  154. data/spec/rley/syntax/production_spec.rb +17 -13
  155. data/spec/rley/syntax/symbol_seq_spec.rb +2 -2
  156. data/spec/rley/syntax/terminal_spec.rb +5 -5
  157. data/spec/rley/syntax/verbatim_symbol_spec.rb +1 -1
  158. data/spec/spec_helper.rb +0 -12
  159. data/spec/support/base_tokenizer_spec.rb +7 -2
  160. metadata +48 -74
  161. data/.simplecov +0 -7
  162. data/lib/rley/parser/parse_state.rb +0 -83
  163. data/lib/rley/parser/parse_state_tracker.rb +0 -59
  164. data/lib/rley/parser/state_set.rb +0 -101
  165. data/spec/rley/parser/parse_state_spec.rb +0 -125
  166. data/spec/rley/parser/parse_tracer_spec.rb +0 -200
  167. data/spec/rley/parser/state_set_spec.rb +0 -130
data/.travis.yml CHANGED
@@ -9,13 +9,13 @@ script:
9
9
  - bundle exec rake
10
10
 
11
11
  rvm:
12
- - 2.6.3
13
- - 2.5.5
14
- - 2.4.6
15
- - 2.3.8
12
+ - 2.7.1
13
+ - 2.6.6
14
+ - 2.5.8
15
+ - 2.4.10
16
16
  - ruby-head
17
- - jruby-9.1.9.0
18
- - jruby-head
17
+ - jruby-head
18
+ before_install: gem install bundler -v 2.0.2
19
19
 
20
20
  matrix:
21
21
  allow_failures:
data/CHANGELOG.md CHANGED
@@ -1,3 +1,19 @@
1
+ ### 0.8.01 / 2021-08-22
2
+ - Unused/redundant file removal. Fix in rule generation
3
+
4
+ * [CHANGE] Removal of files in repository that were redundant/useless.
5
+ * [FIX] The rule ordering was broken by the rules implicitly generated by Rley
6
+
7
+ ### 0.8.00 / 2021-08-15
8
+ - New grammar builder that accepts ? * + modifiers
9
+
10
+ ### 0.7.08 / 2021-05-30
11
+ - Code restyling to please rubocop 1.15.0
12
+
13
+ ### 0.7.07 / 2020-11-16
14
+ - Code restyling to please rubocop 0.93.1
15
+ * [CHANGE] File `.travis.yml`: updated Ruby versions, drop support for Ruby 2.3.x
16
+
1
17
  ### 0.7.06 / 2019-11-22
2
18
  - [FIX] Method `ParseForestBuilder#process_end_entry`: Added a guard expression to prevent nil error.
3
19
 
@@ -5,10 +21,10 @@
5
21
  - [FIX] Method `GFGParsing#nullable_rule`: issue with nullable productions having at least one member in their rhs.
6
22
 
7
23
  ### 0.7.04 / 2019-08-17
8
- - Rley recognizer is about 25% faster than previous version. Kudos to the people
9
- behind the *magic_frozen_string_literal* gem.
24
+ - Rley recognizer is about 25% faster than previous version. Kudos to the people
25
+ behind the *magic_frozen_string_literal* gem.
10
26
  - Code refactoring to use string frozen magic comments (as a consequence, Rley runs only on Rubies 2.3 or newer).
11
- - Code restyling to please rubocop 0.7.40.
27
+ - Code restyling to please rubocop 0.7.40.
12
28
  - [CHANGE] Class `ParseEntrySet`: minor code optimization
13
29
  - [CHANGE] File `README.md` removed allusion to Ruby 2.0.x up to 2.2.x.
14
30
  - [CHANGE] File `README.md` added Ruby 2.6.x up as supported version.
@@ -93,7 +109,7 @@ behind the *magic_frozen_string_literal* gem.
93
109
  * [FIX] Code re-styling to remove most style offenses found by Rubocop 0.52.1
94
110
 
95
111
  ### 0.6.00 / 2018-02-25
96
- Version bump. Highlights: new programming interface through facade object, improved AST generation.
112
+ Version bump. Highlights: new programming interface through facade object, improved AST generation.
97
113
  * [NEW] Class `Rley::Engine`: Implementation of Facade design pattern to reach more convenient interface.
98
114
  * [NEW] Class `Rley::ParseRep::ASTBaseBuilder` Abstract class that simplifies the creation of custom AST (Abstract Syntax Tree)
99
115
  * [NEW] Module `Rley::ParseRep` hosts the classes for building parse representations (parse trees and forests)
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2014-2019 Dimitri Geshef
1
+ Copyright (c) 2014-2021 Dimitri Geshef
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining a copy
4
4
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -50,11 +50,10 @@ application range such as:
50
50
 
51
51
 
52
52
  ### Compatibility
53
- Rley supports the following Ruby implementations:
54
- - MRI 2.3
55
- - MRI 2.4
53
+ Rley supports the following Ruby implementations:
56
54
  - MRI 2.5
57
55
  - MRI 2.6
56
+ - MRI 2.7
58
57
  - JRuby 9.1+
59
58
 
60
59
  ---
@@ -90,7 +89,7 @@ directory
90
89
 
91
90
  # Let's create a facade object called 'engine'
92
91
  # It provides a unified, higher-level interface
93
- engine = Rley.Engine.new
92
+ engine = Rley::Engine.new
94
93
  ```
95
94
 
96
95
 
@@ -106,9 +105,9 @@ The subset of English grammar is based on an example from the NLTK book.
106
105
  # Here we define the productions (= grammar rules)
107
106
  rule 'S' => 'NP VP'
108
107
  rule 'NP' => 'Proper-Noun'
109
- rule 'NP' => 'Determiner Noun'
108
+ rule 'NP' => 'Determiner Noun'
110
109
  rule 'NP' => 'Determiner Noun PP'
111
- rule 'VP' => 'Verb NP'
110
+ rule 'VP' => 'Verb NP'
112
111
  rule 'VP' => 'Verb NP PP'
113
112
  rule 'PP' => 'Preposition NP'
114
113
  end
@@ -446,6 +445,7 @@ actively curated by Andrei Beliankou (aka arbox).
446
445
  ## Thanks to:
447
446
  * Professor Keshav Pingali, one of the creators of the Grammar Flow Graph parsing approach for his encouraging e-mail exchange.
448
447
  * [Arjun Menon](https://github.com/arjunmenon) for his NLP example that uses `engtagger` gem.
448
+ * [Gui Heurich](https://github.com/GuiHeurich) for spotting a mistake in the code sample in `README` file.
449
449
 
450
450
  ## Grammar Flow Graph
451
451
  Since the Grammar Flow Graph parsing approach is quite new, it has not yet taken a place in
@@ -458,5 +458,5 @@ standard parser textbooks. Here are a few references (and links) of papers on GF
458
458
 
459
459
  Copyright
460
460
  ---------
461
- Copyright (c) 2014-2018, Dimitri Geshef.
461
+ Copyright (c) 2014-2020, Dimitri Geshef.
462
462
  __Rley__ is released under the MIT License see [LICENSE.txt](https://github.com/famished-tiger/Rley/blob/master/LICENSE.txt) for details.
@@ -1,190 +1,193 @@
1
- # frozen_string_literal: true
2
-
3
- require 'rley'
4
- require 'engtagger' # Load POS (Part-Of-Speech) tagger EngTagger
5
-
6
- # REGEX to remove XML tags from Engtagger output
7
- GET_TAG = /<(.+?)>(.*?)<.+?>/.freeze
8
-
9
- # Text tokenizer
10
- # Taken directly from Engtagger, will ensure uniform indexing while parsing
11
- def clean_text(text)
12
- return false unless valid_text(text)
13
-
14
- text = text.toutf8
15
- cleaned_text = text
16
- tokenized = []
17
- # Tokenize the text (splitting on punctuation as you go)
18
- cleaned_text.split(/\s+/).each do |line|
19
- tokenized += split_punct(line)
20
- end
21
- words = split_sentences(tokenized)
22
- return words
23
- end
24
-
25
- def valid_text(text)
26
- if !text
27
- # there's nothing to parse
28
- puts 'method call on uninitialized variable'
29
- return false
30
- elsif /\A\s*\z/ =~ text
31
- # text is an empty string, nothing to parse
32
- return false
33
- else
34
- # $text is valid
35
- return true
36
- end
37
- end
38
-
39
- def split_sentences(array)
40
- tokenized = array
41
- people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
42
- supt det mssrs rev]
43
- army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
44
- inst = %w[dept univ assn bros ph.d]
45
- place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
46
- hwy hway la pde pd plz pl rd st tce]
47
- comp = %w[mfg inc ltd co corp]
48
- state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
49
- ind ia kans kan ken ky la me md is mass mich minn miss mo mont
50
- neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
51
- va wash wis wisc wy wyo usafa alta man ont que sask yuk]
52
- month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
53
- misc = %w[vs etc no esp]
54
- abbr = {}
55
- [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
56
- abbr[i] = true
57
- end
58
- words = []
59
- tokenized.each_with_index do |_t, i|
60
- if tokenized[i + 1] &&
61
- tokenized [i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
62
- w = $1
63
- # Don't separate the period off words that
64
- # meet any of the following conditions:
65
- #
66
- # 1. It is defined in one of the lists above
67
- # 2. It is only one letter long: Alfred E. Sloan
68
- # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
69
- unless abbr[w.downcase] ||
70
- w =~ /\A[a-z]\z/i || w =~ /[a-z](?:\.[a-z])+\z/i
71
- words << w
72
- words << '.'
73
- next
74
- end
75
- end
76
- words << tokenized[i]
77
- end
78
- # If the final word ends in a period..
79
- if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
80
- words[-1] = $1
81
- words.push '.'
82
- end
83
- return words
84
- end
85
-
86
- # Separate punctuation from words, where appropriate. This leaves trailing
87
- # periods in place to be dealt with later. Called by the clean_text method.
88
- def split_punct(text)
89
- # If there's no punctuation, return immediately
90
- return [text] if /\A\w+\z/ =~ text
91
-
92
- # Sanity checks
93
- text = text.gsub(/\W{10,}/o, ' ')
94
-
95
- # Put quotes into a standard format
96
- text = text.gsub(/`(?!`)(?=.*\w)/o, '` ') # Shift left quotes off text
97
- text = text.gsub(/"(?=.*\w)/o, ' `` ') # Convert left quotes to ``
98
-
99
- # Convert left quote to `
100
- text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? $1 + ' ` ' : ' ` ' }
101
- text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
102
-
103
- # Separate right single quotes
104
- text = text.gsub(/(\w)'(?!')(?=\W|$)/o, "\\1 ' ")
105
-
106
- # Handle all other punctuation
107
- text = text.gsub(/--+/o, ' - ') # Convert and separate dashes
108
- text = text.gsub(/,(?!\d)/o, ' , ') # Shift comma if not following by digit
109
- text = text.gsub(/:/o, ' :') # Shift semicolon off
110
- text = text.gsub(/(\.\.\.+)/o, ' \1 ') # Shift ellipses off
111
- text = text.gsub(/([\(\[\{\}\]\)])/o, ' \1 ') # Shift off brackets
112
-
113
- # Shift off other ``standard'' punctuation
114
- text = text.gsub(/([\!\?#\$%;~|])/o, ' \1 ')
115
-
116
- # English-specific contractions
117
- # Separate off 'd 'm 's
118
- text = text.gsub(/([A-Za-z])'([dms])\b/o, "\\1 '\\2")
119
- text = text.gsub(/n't\b/o, " n't") # Separate off n't
120
- text = text.gsub(/'(ve|ll|re)\b/o, " '\\1") # Separate off 've, 'll, 're
121
- result = text.split(' ')
122
- return result
123
- end
124
-
125
-
126
- # Instantiate a facade object as our Rley interface
127
- nlp_engine = Rley::Engine.new
128
-
129
- # Now build a very simplified English grammar...
130
- nlp_engine.build_grammar do
131
- # Terminals have same names as POS tags returned by Engtagger
132
- add_terminals('NN', 'NNP')
133
- add_terminals('DET', 'IN', 'VBD')
134
-
135
- # Here we define the productions (= grammar rules)
136
- rule 'S' => %w[NP VP]
137
- rule 'NP' => 'NNP'
138
- rule 'NP' => %w[DET NN]
139
- rule 'NP' => %w[DET NN PP]
140
- rule 'VP' => %w[VBD NP]
141
- rule 'VP' => %w[VBD NP PP]
142
- rule 'PP' => %w[IN NP]
143
- end
144
-
145
- # text = "Yo I'm not done with you"
146
- text = 'John saw Mary with a telescope'
147
- puts "Input text --> #{text}"
148
-
149
- tgr = EngTagger.new
150
-
151
- # Generate raw POS output
152
- tagged = tgr.add_tags(text)
153
-
154
- # Generte tokenied lexicon of input text
155
- # Instead of creating a lexicon dictionary,
156
- # we would simply generate one each time on the fly for the current text only.
157
- lexicon = clean_text(text)
158
-
159
- # Convert EngTagger POS tokens in [[word, pos], ..] format
160
- tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
161
-
162
- def tokenizer(lexicon, tokens)
163
- pos = -1
164
- rley_tokens = []
165
- lexicon.each_with_index do |word, i|
166
- term_name = tokens[i].last
167
- rank = Rley::Lexical::Position.new(1, pos + 1)
168
- pos += word.length + 1 # Assuming one space between words.
169
- rley_tokens << Rley::Lexical::Token.new(word, term_name, rank)
170
- end
171
- return rley_tokens
172
- end
173
-
174
- # Convert input text into a sequence of rley token objects...
175
- rley_tokens = tokenizer(lexicon, tokens)
176
-
177
- # Let Rley grok the tokens
178
- result = nlp_engine.parse(rley_tokens)
179
-
180
- puts "Parsing successful? #{result.success?}" # => Parsing successful? true
181
- puts result.failure_reason.message unless result.success?
182
-
183
- ptree = nlp_engine.convert(result)
184
-
185
- visitor = nlp_engine.ptree_visitor(ptree)
186
-
187
- renderer = Rley::Formatter::Asciitree.new($stdout)
188
-
189
- # Let's visualize the parse tree (in text format...)
190
- puts renderer.render(visitor)
1
+ # frozen_string_literal: true
2
+
3
+ require 'rley'
4
+ require 'engtagger' # Load POS (Part-Of-Speech) tagger EngTagger
5
+
6
+ # REGEX to remove XML tags from Engtagger output
7
+ GET_TAG = /<(.+?)>(.*?)<.+?>/.freeze
8
+
9
+ # Text tokenizer
10
+ # Taken directly from Engtagger, will ensure uniform indexing while parsing
11
+ def clean_text(text)
12
+ return false unless valid_text(text)
13
+
14
+ text = text.toutf8
15
+ cleaned_text = text
16
+ tokenized = []
17
+ # Tokenize the text (splitting on punctuation as you go)
18
+ cleaned_text.split(/\s+/).each do |line|
19
+ tokenized += split_punct(line)
20
+ end
21
+ words = split_sentences(tokenized)
22
+ return words
23
+ end
24
+
25
+ def valid_text(text)
26
+ if !text
27
+ # there's nothing to parse
28
+ puts 'method call on uninitialized variable'
29
+ return false
30
+ elsif /\A\s*\z/ =~ text
31
+ # text is an empty string, nothing to parse
32
+ return false
33
+ else
34
+ # $text is valid
35
+ return true
36
+ end
37
+ end
38
+
39
+ def split_sentences(array)
40
+ # rubocop: disable Layout/ArrayAlignment
41
+ tokenized = array
42
+ people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
43
+ supt det mssrs rev]
44
+ army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
45
+ inst = %w[dept univ assn bros ph.d]
46
+ place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
47
+ hwy hway la pde pd plz pl rd st tce]
48
+ comp = %w[mfg inc ltd co corp]
49
+ state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
50
+ ind ia kans kan ken ky la me md is mass mich minn miss mo mont
51
+ neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
52
+ va wash wis wisc wy wyo usafa alta man ont que sask yuk]
53
+ month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
54
+ misc = %w[vs etc no esp]
55
+ abbr = {}
56
+ [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
57
+ abbr[i] = true
58
+ end
59
+ words = []
60
+ tokenized.each_with_index do |_t, i|
61
+ if tokenized[i + 1] &&
62
+ tokenized[i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
63
+ w = $1
64
+ # Don't separate the period off words that
65
+ # meet any of the following conditions:
66
+ #
67
+ # 1. It is defined in one of the lists above
68
+ # 2. It is only one letter long: Alfred E. Sloan
69
+ # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
70
+ unless abbr[w.downcase] ||
71
+ w =~ /\A[a-z]\z/i || w =~ /[a-z](?:\.[a-z])+\z/i
72
+ words << w
73
+ words << '.'
74
+ next
75
+ end
76
+ end
77
+ words << tokenized[i]
78
+ end
79
+
80
+ # If the final word ends in a period..
81
+ if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
82
+ words[-1] = $1
83
+ words.push '.'
84
+ end
85
+ words
86
+ end
87
+ # rubocop: enable Layout/ArrayAlignment
88
+
89
+ # Separate punctuation from words, where appropriate. This leaves trailing
90
+ # periods in place to be dealt with later. Called by the clean_text method.
91
+ def split_punct(text)
92
+ # If there's no punctuation, return immediately
93
+ return [text] if /\A\w+\z/ =~ text
94
+
95
+ # Sanity checks
96
+ text = text.gsub(/\W{10,}/o, ' ')
97
+
98
+ # Put quotes into a standard format
99
+ text = text.gsub(/`(?!`)(?=.*\w)/o, '` ') # Shift left quotes off text
100
+ text = text.gsub(/"(?=.*\w)/o, ' `` ') # Convert left quotes to ``
101
+
102
+ # Convert left quote to `
103
+ text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? "#{$1} ` " : ' ` ' }
104
+ text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
105
+
106
+ # Separate right single quotes
107
+ text = text.gsub(/(\w)'(?!')(?=\W|$)/o, "\\1 ' ")
108
+
109
+ # Handle all other punctuation
110
+ text = text.gsub(/--+/o, ' - ') # Convert and separate dashes
111
+ text = text.gsub(/,(?!\d)/o, ' , ') # Shift comma if not followed by digit
112
+ text = text.gsub(/:/o, ' :') # Shift semicolon off
113
+ text = text.gsub(/(\.\.\.+)/o, ' \1 ') # Shift ellipses off
114
+ text = text.gsub(/([(\[{}\])])/o, ' \1 ') # Shift off brackets
115
+
116
+ # Shift off other ``standard'' punctuation
117
+ text = text.gsub(/([!?#$%;~|])/o, ' \1 ')
118
+
119
+ # English-specific contractions
120
+ # Separate off 'd 'm 's
121
+ text = text.gsub(/([A-Za-z])'([dms])\b/o, "\\1 '\\2")
122
+ text = text.gsub(/n't\b/o, " n't") # Separate off n't
123
+ text = text.gsub(/'(ve|ll|re)\b/o, " '\\1") # Separate off 've, 'll, 're
124
+ result = text.split
125
+ return result
126
+ end
127
+
128
+
129
+ # Instantiate a facade object as our Rley interface
130
+ nlp_engine = Rley::Engine.new
131
+
132
+ # Now build a very simplified English grammar...
133
+ nlp_engine.build_grammar do
134
+ # Terminals have same names as POS tags returned by Engtagger
135
+ add_terminals('NN', 'NNP')
136
+ add_terminals('DET', 'IN', 'VBD')
137
+
138
+ # Here we define the productions (= grammar rules)
139
+ rule 'S' => %w[NP VP]
140
+ rule 'NP' => 'NNP'
141
+ rule 'NP' => %w[DET NN]
142
+ rule 'NP' => %w[DET NN PP]
143
+ rule 'VP' => %w[VBD NP]
144
+ rule 'VP' => %w[VBD NP PP]
145
+ rule 'PP' => %w[IN NP]
146
+ end
147
+
148
+ # text = "Yo I'm not done with you"
149
+ text = 'John saw Mary with a telescope'
150
+ puts "Input text --> #{text}"
151
+
152
+ tgr = EngTagger.new
153
+
154
+ # Generate raw POS output
155
+ tagged = tgr.add_tags(text)
156
+
157
+ # Generte tokenied lexicon of input text
158
+ # Instead of creating a lexicon dictionary,
159
+ # we would simply generate one each time on the fly for the current text only.
160
+ lexicon = clean_text(text)
161
+
162
+ # Convert EngTagger POS tokens in [[word, pos], ..] format
163
+ tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
164
+
165
+ def tokenizer(lexicon, tokens)
166
+ pos = -1
167
+ rley_tokens = []
168
+ lexicon.each_with_index do |word, i|
169
+ term_name = tokens[i].last
170
+ rank = Rley::Lexical::Position.new(1, pos + 1)
171
+ pos += word.length + 1 # Assuming one space between words.
172
+ rley_tokens << Rley::Lexical::Token.new(word, term_name, rank)
173
+ end
174
+ return rley_tokens
175
+ end
176
+
177
+ # Convert input text into a sequence of rley token objects...
178
+ rley_tokens = tokenizer(lexicon, tokens)
179
+
180
+ # Let Rley grok the tokens
181
+ result = nlp_engine.parse(rley_tokens)
182
+
183
+ puts "Parsing successful? #{result.success?}" # => Parsing successful? true
184
+ puts result.failure_reason.message unless result.success?
185
+
186
+ ptree = nlp_engine.convert(result)
187
+
188
+ visitor = nlp_engine.ptree_visitor(ptree)
189
+
190
+ renderer = Rley::Formatter::Asciitree.new($stdout)
191
+
192
+ # Let's visualize the parse tree (in text format...)
193
+ puts renderer.render(visitor)