rley 0.7.03 → 0.7.08

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +362 -62
  3. data/.travis.yml +6 -7
  4. data/CHANGELOG.md +20 -1
  5. data/LICENSE.txt +1 -1
  6. data/README.md +6 -7
  7. data/Rakefile +2 -0
  8. data/appveyor.yml +2 -4
  9. data/examples/NLP/benchmark_pico_en.rb +2 -0
  10. data/examples/NLP/engtagger.rb +193 -188
  11. data/examples/NLP/nano_eng/nano_en_demo.rb +2 -0
  12. data/examples/NLP/nano_eng/nano_grammar.rb +7 -5
  13. data/examples/NLP/pico_en_demo.rb +2 -0
  14. data/examples/data_formats/JSON/cli_options.rb +3 -1
  15. data/examples/data_formats/JSON/json_ast_builder.rb +14 -9
  16. data/examples/data_formats/JSON/json_ast_nodes.rb +14 -21
  17. data/examples/data_formats/JSON/json_demo.rb +2 -0
  18. data/examples/data_formats/JSON/json_grammar.rb +4 -2
  19. data/examples/data_formats/JSON/json_lexer.rb +10 -8
  20. data/examples/data_formats/JSON/json_minifier.rb +3 -1
  21. data/examples/general/calc_iter1/calc_ast_builder.rb +15 -10
  22. data/examples/general/calc_iter1/calc_ast_nodes.rb +25 -37
  23. data/examples/general/calc_iter1/calc_demo.rb +2 -0
  24. data/examples/general/calc_iter1/calc_grammar.rb +4 -2
  25. data/examples/general/calc_iter1/calc_lexer.rb +8 -4
  26. data/examples/general/calc_iter1/spec/calculator_spec.rb +7 -5
  27. data/examples/general/calc_iter2/calc_ast_builder.rb +7 -3
  28. data/examples/general/calc_iter2/calc_ast_nodes.rb +29 -43
  29. data/examples/general/calc_iter2/calc_demo.rb +2 -0
  30. data/examples/general/calc_iter2/calc_grammar.rb +5 -3
  31. data/examples/general/calc_iter2/calc_lexer.rb +13 -10
  32. data/examples/general/calc_iter2/spec/calculator_spec.rb +28 -26
  33. data/examples/general/left.rb +4 -2
  34. data/examples/general/right.rb +4 -2
  35. data/lib/rley.rb +2 -0
  36. data/lib/rley/base/base_parser.rb +2 -0
  37. data/lib/rley/base/dotted_item.rb +38 -41
  38. data/lib/rley/base/grm_items_builder.rb +2 -0
  39. data/lib/rley/constants.rb +5 -3
  40. data/lib/rley/engine.rb +22 -24
  41. data/lib/rley/formatter/asciitree.rb +6 -4
  42. data/lib/rley/formatter/base_formatter.rb +2 -0
  43. data/lib/rley/formatter/bracket_notation.rb +3 -8
  44. data/lib/rley/formatter/debug.rb +8 -6
  45. data/lib/rley/formatter/json.rb +4 -2
  46. data/lib/rley/gfg/call_edge.rb +3 -1
  47. data/lib/rley/gfg/edge.rb +7 -5
  48. data/lib/rley/gfg/end_vertex.rb +4 -6
  49. data/lib/rley/gfg/epsilon_edge.rb +3 -5
  50. data/lib/rley/gfg/grm_flow_graph.rb +31 -25
  51. data/lib/rley/gfg/item_vertex.rb +12 -22
  52. data/lib/rley/gfg/non_terminal_vertex.rb +6 -4
  53. data/lib/rley/gfg/return_edge.rb +2 -0
  54. data/lib/rley/gfg/scan_edge.rb +3 -1
  55. data/lib/rley/gfg/shortcut_edge.rb +4 -2
  56. data/lib/rley/gfg/start_vertex.rb +6 -8
  57. data/lib/rley/gfg/vertex.rb +47 -41
  58. data/lib/rley/lexical/token.rb +3 -1
  59. data/lib/rley/lexical/token_range.rb +8 -6
  60. data/lib/rley/parse_forest_visitor.rb +7 -5
  61. data/lib/rley/parse_rep/ast_base_builder.rb +11 -11
  62. data/lib/rley/parse_rep/cst_builder.rb +7 -4
  63. data/lib/rley/parse_rep/parse_forest_builder.rb +36 -25
  64. data/lib/rley/parse_rep/parse_forest_factory.rb +5 -3
  65. data/lib/rley/parse_rep/parse_rep_creator.rb +18 -13
  66. data/lib/rley/parse_rep/parse_tree_builder.rb +15 -15
  67. data/lib/rley/parse_rep/parse_tree_factory.rb +27 -25
  68. data/lib/rley/parse_tree_visitor.rb +3 -1
  69. data/lib/rley/parser/error_reason.rb +9 -8
  70. data/lib/rley/parser/gfg_chart.rb +54 -22
  71. data/lib/rley/parser/gfg_earley_parser.rb +3 -1
  72. data/lib/rley/parser/gfg_parsing.rb +51 -31
  73. data/lib/rley/parser/parse_entry.rb +29 -33
  74. data/lib/rley/parser/parse_entry_set.rb +32 -27
  75. data/lib/rley/parser/parse_entry_tracker.rb +6 -4
  76. data/lib/rley/parser/parse_state.rb +18 -21
  77. data/lib/rley/parser/parse_state_tracker.rb +6 -4
  78. data/lib/rley/parser/parse_tracer.rb +15 -13
  79. data/lib/rley/parser/parse_walker_factory.rb +28 -29
  80. data/lib/rley/parser/state_set.rb +11 -10
  81. data/lib/rley/ptree/non_terminal_node.rb +10 -6
  82. data/lib/rley/ptree/parse_tree.rb +6 -4
  83. data/lib/rley/ptree/parse_tree_node.rb +7 -5
  84. data/lib/rley/ptree/terminal_node.rb +9 -7
  85. data/lib/rley/rley_error.rb +12 -10
  86. data/lib/rley/sppf/alternative_node.rb +8 -6
  87. data/lib/rley/sppf/composite_node.rb +9 -7
  88. data/lib/rley/sppf/epsilon_node.rb +5 -3
  89. data/lib/rley/sppf/leaf_node.rb +5 -3
  90. data/lib/rley/sppf/non_terminal_node.rb +2 -0
  91. data/lib/rley/sppf/parse_forest.rb +19 -17
  92. data/lib/rley/sppf/sppf_node.rb +9 -8
  93. data/lib/rley/sppf/token_node.rb +5 -3
  94. data/lib/rley/syntax/grammar.rb +7 -5
  95. data/lib/rley/syntax/grammar_builder.rb +11 -9
  96. data/lib/rley/syntax/grm_symbol.rb +8 -6
  97. data/lib/rley/syntax/literal.rb +2 -0
  98. data/lib/rley/syntax/non_terminal.rb +11 -15
  99. data/lib/rley/syntax/production.rb +13 -11
  100. data/lib/rley/syntax/symbol_seq.rb +10 -10
  101. data/lib/rley/syntax/terminal.rb +6 -5
  102. data/lib/rley/syntax/verbatim_symbol.rb +5 -3
  103. data/lib/support/base_tokenizer.rb +23 -20
  104. data/spec/rley/base/dotted_item_spec.rb +4 -2
  105. data/spec/rley/base/grm_items_builder_spec.rb +2 -0
  106. data/spec/rley/engine_spec.rb +47 -9
  107. data/spec/rley/formatter/asciitree_spec.rb +11 -9
  108. data/spec/rley/formatter/bracket_notation_spec.rb +16 -14
  109. data/spec/rley/formatter/debug_spec.rb +4 -2
  110. data/spec/rley/formatter/json_spec.rb +5 -3
  111. data/spec/rley/gfg/call_edge_spec.rb +2 -0
  112. data/spec/rley/gfg/edge_spec.rb +2 -0
  113. data/spec/rley/gfg/end_vertex_spec.rb +7 -5
  114. data/spec/rley/gfg/epsilon_edge_spec.rb +2 -0
  115. data/spec/rley/gfg/grm_flow_graph_spec.rb +2 -0
  116. data/spec/rley/gfg/item_vertex_spec.rb +12 -10
  117. data/spec/rley/gfg/non_terminal_vertex_spec.rb +5 -3
  118. data/spec/rley/gfg/return_edge_spec.rb +2 -0
  119. data/spec/rley/gfg/scan_edge_spec.rb +2 -0
  120. data/spec/rley/gfg/shortcut_edge_spec.rb +3 -1
  121. data/spec/rley/gfg/start_vertex_spec.rb +7 -5
  122. data/spec/rley/gfg/vertex_spec.rb +5 -3
  123. data/spec/rley/lexical/token_range_spec.rb +18 -16
  124. data/spec/rley/lexical/token_spec.rb +4 -2
  125. data/spec/rley/parse_forest_visitor_spec.rb +167 -163
  126. data/spec/rley/parse_rep/ambiguous_parse_spec.rb +46 -44
  127. data/spec/rley/parse_rep/ast_builder_spec.rb +8 -6
  128. data/spec/rley/parse_rep/cst_builder_spec.rb +7 -5
  129. data/spec/rley/parse_rep/groucho_spec.rb +25 -25
  130. data/spec/rley/parse_rep/parse_forest_builder_spec.rb +28 -26
  131. data/spec/rley/parse_rep/parse_forest_factory_spec.rb +8 -6
  132. data/spec/rley/parse_rep/parse_tree_factory_spec.rb +4 -2
  133. data/spec/rley/parse_tree_visitor_spec.rb +12 -8
  134. data/spec/rley/parser/error_reason_spec.rb +8 -6
  135. data/spec/rley/parser/gfg_chart_spec.rb +17 -4
  136. data/spec/rley/parser/gfg_earley_parser_spec.rb +16 -11
  137. data/spec/rley/parser/gfg_parsing_spec.rb +41 -252
  138. data/spec/rley/parser/parse_entry_set_spec.rb +2 -0
  139. data/spec/rley/parser/parse_entry_spec.rb +21 -19
  140. data/spec/rley/parser/parse_state_spec.rb +7 -5
  141. data/spec/rley/parser/parse_tracer_spec.rb +16 -14
  142. data/spec/rley/parser/parse_walker_factory_spec.rb +10 -8
  143. data/spec/rley/parser/state_set_spec.rb +24 -22
  144. data/spec/rley/ptree/non_terminal_node_spec.rb +7 -3
  145. data/spec/rley/ptree/parse_tree_node_spec.rb +6 -4
  146. data/spec/rley/ptree/parse_tree_spec.rb +2 -0
  147. data/spec/rley/ptree/terminal_node_spec.rb +8 -6
  148. data/spec/rley/sppf/alternative_node_spec.rb +8 -6
  149. data/spec/rley/sppf/non_terminal_node_spec.rb +5 -3
  150. data/spec/rley/sppf/token_node_spec.rb +6 -4
  151. data/spec/rley/support/ambiguous_grammar_helper.rb +5 -4
  152. data/spec/rley/support/expectation_helper.rb +2 -0
  153. data/spec/rley/support/grammar_abc_helper.rb +4 -4
  154. data/spec/rley/support/grammar_ambig01_helper.rb +6 -5
  155. data/spec/rley/support/grammar_arr_int_helper.rb +6 -5
  156. data/spec/rley/support/grammar_b_expr_helper.rb +6 -5
  157. data/spec/rley/support/grammar_helper.rb +2 -0
  158. data/spec/rley/support/grammar_l0_helper.rb +15 -16
  159. data/spec/rley/support/grammar_pb_helper.rb +8 -5
  160. data/spec/rley/support/grammar_sppf_helper.rb +3 -1
  161. data/spec/rley/syntax/grammar_builder_spec.rb +7 -5
  162. data/spec/rley/syntax/grammar_spec.rb +8 -6
  163. data/spec/rley/syntax/grm_symbol_spec.rb +3 -1
  164. data/spec/rley/syntax/literal_spec.rb +2 -0
  165. data/spec/rley/syntax/non_terminal_spec.rb +10 -8
  166. data/spec/rley/syntax/production_spec.rb +15 -13
  167. data/spec/rley/syntax/symbol_seq_spec.rb +4 -2
  168. data/spec/rley/syntax/terminal_spec.rb +7 -5
  169. data/spec/rley/syntax/verbatim_symbol_spec.rb +3 -1
  170. data/spec/spec_helper.rb +2 -12
  171. data/spec/support/base_tokenizer_spec.rb +9 -2
  172. metadata +21 -63
  173. data/.simplecov +0 -7
  174. data/Gemfile +0 -8
data/.travis.yml CHANGED
@@ -9,17 +9,16 @@ script:
9
9
  - bundle exec rake
10
10
 
11
11
  rvm:
12
- - 2.6.0
13
- - 2.5.3
14
- - 2.4.5
15
- - 2.3.8
12
+ - 2.7.1
13
+ - 2.6.6
14
+ - 2.5.8
15
+ - 2.4.10
16
16
  - ruby-head
17
- - jruby-9.1.9.0
18
- - jruby-head
17
+ - jruby-head
18
+ before_install: gem install bundler -v 2.0.2
19
19
 
20
20
  matrix:
21
21
  allow_failures:
22
- - rvm: 2.6.0
23
22
  - rvm: ruby-head
24
23
  - rvm: jruby-head
25
24
 
data/CHANGELOG.md CHANGED
@@ -1,10 +1,29 @@
1
+ ### 0.7.07 / 2020-11-16
2
+ - Code restyling to please rubocop 0.93.1
3
+ * [CHANGE] File `.travis.yml`: updated Ruby versions, drop support for Ruby 2.3.x
4
+
5
+ ### 0.7.06 / 2019-11-22
6
+ - [FIX] Method `ParseForestBuilder#process_end_entry`: Added a guard expression to prevent nil error.
7
+
8
+ ### 0.7.05 / 2019-11-17
9
+ - [FIX] Method `GFGParsing#nullable_rule`: issue with nullable productions having at least one member in their rhs.
10
+
11
+ ### 0.7.04 / 2019-08-17
12
+ - Rley recognizer is about 25% faster than previous version. Kudos to the people
13
+ behind the *magic_frozen_string_literal* gem.
14
+ - Code refactoring to use string frozen magic comments (as a consequence, Rley runs only on Rubies 2.3 or newer).
15
+ - Code restyling to please rubocop 0.7.40.
16
+ - [CHANGE] Class `ParseEntrySet`: minor code optimization
17
+ - [CHANGE] File `README.md` removed allusion to Ruby 2.0.x up to 2.2.x.
18
+ - [CHANGE] File `README.md` added Ruby 2.6.x up as supported version.
19
+
20
+
1
21
  ### 0.7.03 / 2019-07-21
2
22
  - Minor refactoring for parsing speed improvement.
3
23
  * [CHANGE] Replacing comparison with object_id by `equal?` method
4
24
  * [CHANGE] Method `ParseEntry#hash` added
5
25
  * [CHANGE] Class `ParseEntrySet` added new Hash attribute in order to speed up membership testing.
6
26
 
7
-
8
27
  ### 0.7.02 / 2019-01-13
9
28
  - Removed Ruby versions older than 2.3 in CI testing because of breaking changes by Bundler 2.0
10
29
  * [CHANGE] Files `Gemfile`, `.travis.yml`, `appveyor.yml` updated.
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2014-2019 Dimitri Geshef
1
+ Copyright (c) 2014-2021 Dimitri Geshef
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining a copy
4
4
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -50,13 +50,11 @@ application range such as:
50
50
 
51
51
 
52
52
  ### Compatibility
53
- Rley supports the following Ruby implementations:
54
- - MRI 2.0
55
- - MRI 2.1
56
- - MRI 2.2
53
+ Rley supports the following Ruby implementations:
57
54
  - MRI 2.3
58
55
  - MRI 2.4
59
- - MRI 2.5
56
+ - MRI 2.5
57
+ - MRI 2.6
60
58
  - JRuby 9.1+
61
59
 
62
60
  ---
@@ -92,7 +90,7 @@ directory
92
90
 
93
91
  # Let's create a facade object called 'engine'
94
92
  # It provides a unified, higher-level interface
95
- engine = Rley.Engine.new
93
+ engine = Rley::Engine.new
96
94
  ```
97
95
 
98
96
 
@@ -448,6 +446,7 @@ actively curated by Andrei Beliankou (aka arbox).
448
446
  ## Thanks to:
449
447
  * Professor Keshav Pingali, one of the creators of the Grammar Flow Graph parsing approach for his encouraging e-mail exchange.
450
448
  * [Arjun Menon](https://github.com/arjunmenon) for his NLP example that uses `engtagger` gem.
449
+ * [Gui Heurich](https://github.com/GuiHeurich) for spotting a mistake in the code sample in `README` file.
451
450
 
452
451
  ## Grammar Flow Graph
453
452
  Since the Grammar Flow Graph parsing approach is quite new, it has not yet taken a place in
@@ -460,5 +459,5 @@ standard parser textbooks. Here are a few references (and links) of papers on GF
460
459
 
461
460
  Copyright
462
461
  ---------
463
- Copyright (c) 2014-2018, Dimitri Geshef.
462
+ Copyright (c) 2014-2020, Dimitri Geshef.
464
463
  __Rley__ is released under the MIT License see [LICENSE.txt](https://github.com/famished-tiger/Rley/blob/master/LICENSE.txt) for details.
data/Rakefile CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'rubygems'
2
4
  require_relative './lib/rley/constants'
3
5
 
data/appveyor.yml CHANGED
@@ -2,17 +2,15 @@ version: '{build}'
2
2
  max_jobs: 5
3
3
  environment:
4
4
  matrix:
5
+ - Ruby_version: 26-x64
5
6
  - Ruby_version: 25-x64
6
7
  - Ruby_version: 24-x64
7
8
  - Ruby_version: 23-x64
9
+ - Ruby_version: 26
8
10
  - Ruby_version: 25
9
11
  - Ruby_version: 24
10
12
  - Ruby_version: 23
11
13
 
12
- # These are failing
13
- # - Ruby_version: 26
14
- # - Ruby_version: 26-x64
15
-
16
14
  install:
17
15
  - set PATH=C:\Ruby%Ruby_version%\bin;%PATH%
18
16
  - gem update --system
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # File: benchmark_pico_en.rb
2
4
  # Purpose: benchmark the parse speed
3
5
  require 'benchmark'
@@ -1,188 +1,193 @@
1
- require 'rley'
2
- require 'engtagger' # Load POS (Part-Of-Speech) tagger EngTagger
3
-
4
- # REGEX to remove XML tags from Engtagger output
5
- GET_TAG = /<(.+?)>(.*?)<.+?>/.freeze
6
-
7
- # Text tokenizer
8
- # Taken directly from Engtagger, will ensure uniform indexing while parsing
9
- def clean_text(text)
10
- return false unless valid_text(text)
11
-
12
- text = text.toutf8
13
- cleaned_text = text
14
- tokenized = []
15
- # Tokenize the text (splitting on punctuation as you go)
16
- cleaned_text.split(/\s+/).each do |line|
17
- tokenized += split_punct(line)
18
- end
19
- words = split_sentences(tokenized)
20
- return words
21
- end
22
-
23
- def valid_text(text)
24
- if !text
25
- # there's nothing to parse
26
- puts 'method call on uninitialized variable'
27
- return false
28
- elsif /\A\s*\z/ =~ text
29
- # text is an empty string, nothing to parse
30
- return false
31
- else
32
- # $text is valid
33
- return true
34
- end
35
- end
36
-
37
- def split_sentences(array)
38
- tokenized = array
39
- people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
40
- supt det mssrs rev]
41
- army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
42
- inst = %w[dept univ assn bros ph.d]
43
- place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
44
- hwy hway la pde pd plz pl rd st tce]
45
- comp = %w[mfg inc ltd co corp]
46
- state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
47
- ind ia kans kan ken ky la me md is mass mich minn miss mo mont
48
- neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
49
- va wash wis wisc wy wyo usafa alta man ont que sask yuk]
50
- month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
51
- misc = %w[vs etc no esp]
52
- abbr = {}
53
- [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
54
- abbr[i] = true
55
- end
56
- words = []
57
- tokenized.each_with_index do |_t, i|
58
- if tokenized[i + 1] &&
59
- tokenized [i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
60
- w = $1
61
- # Don't separate the period off words that
62
- # meet any of the following conditions:
63
- #
64
- # 1. It is defined in one of the lists above
65
- # 2. It is only one letter long: Alfred E. Sloan
66
- # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
67
- unless abbr[w.downcase] ||
68
- w =~ /\A[a-z]\z/i || w =~ /[a-z](?:\.[a-z])+\z/i
69
- words << w
70
- words << '.'
71
- next
72
- end
73
- end
74
- words << tokenized[i]
75
- end
76
- # If the final word ends in a period..
77
- if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
78
- words[-1] = $1
79
- words.push '.'
80
- end
81
- return words
82
- end
83
-
84
- # Separate punctuation from words, where appropriate. This leaves trailing
85
- # periods in place to be dealt with later. Called by the clean_text method.
86
- def split_punct(text)
87
- # If there's no punctuation, return immediately
88
- return [text] if /\A\w+\z/ =~ text
89
-
90
- # Sanity checks
91
- text = text.gsub(/\W{10,}/o, ' ')
92
-
93
- # Put quotes into a standard format
94
- text = text.gsub(/`(?!`)(?=.*\w)/o, '` ') # Shift left quotes off text
95
- text = text.gsub(/"(?=.*\w)/o, ' `` ') # Convert left quotes to ``
96
-
97
- # Convert left quote to `
98
- text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? $1 + ' ` ' : ' ` ' }
99
- text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
100
-
101
- # Separate right single quotes
102
- text = text.gsub(/(\w)'(?!')(?=\W|$)/o, "\\1 ' ")
103
-
104
- # Handle all other punctuation
105
- text = text.gsub(/--+/o, ' - ') # Convert and separate dashes
106
- text = text.gsub(/,(?!\d)/o, ' , ') # Shift comma if not following by digit
107
- text = text.gsub(/:/o, ' :') # Shift semicolon off
108
- text = text.gsub(/(\.\.\.+)/o, ' \1 ') # Shift ellipses off
109
- text = text.gsub(/([\(\[\{\}\]\)])/o, ' \1 ') # Shift off brackets
110
-
111
- # Shift off other ``standard'' punctuation
112
- text = text.gsub(/([\!\?#\$%;~|])/o, ' \1 ')
113
-
114
- # English-specific contractions
115
- # Separate off 'd 'm 's
116
- text = text.gsub(/([A-Za-z])'([dms])\b/o, "\\1 '\\2")
117
- text = text.gsub(/n't\b/o, " n't") # Separate off n't
118
- text = text.gsub(/'(ve|ll|re)\b/o, " '\\1") # Separate off 've, 'll, 're
119
- result = text.split(' ')
120
- return result
121
- end
122
-
123
-
124
- # Instantiate a facade object as our Rley interface
125
- nlp_engine = Rley::Engine.new
126
-
127
- # Now build a very simplified English grammar...
128
- nlp_engine.build_grammar do
129
- # Terminals have same names as POS tags returned by Engtagger
130
- add_terminals('NN', 'NNP')
131
- add_terminals('DET', 'IN', 'VBD')
132
-
133
- # Here we define the productions (= grammar rules)
134
- rule 'S' => %w[NP VP]
135
- rule 'NP' => 'NNP'
136
- rule 'NP' => %w[DET NN]
137
- rule 'NP' => %w[DET NN PP]
138
- rule 'VP' => %w[VBD NP]
139
- rule 'VP' => %w[VBD NP PP]
140
- rule 'PP' => %w[IN NP]
141
- end
142
-
143
- # text = "Yo I'm not done with you"
144
- text = 'John saw Mary with a telescope'
145
- puts "Input text --> #{text}"
146
-
147
- tgr = EngTagger.new
148
-
149
- # Generate raw POS output
150
- tagged = tgr.add_tags(text)
151
-
152
- # Generte tokenied lexicon of input text
153
- # Instead of creating a lexicon dictionary,
154
- # we would simply generate one each time on the fly for the current text only.
155
- lexicon = clean_text(text)
156
-
157
- # Convert EngTagger POS tokens in [[word, pos], ..] format
158
- tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
159
-
160
- def tokenizer(lexicon, tokens)
161
- pos = -1
162
- rley_tokens = []
163
- lexicon.each_with_index do |word, i|
164
- term_name = tokens[i].last
165
- rank = Rley::Lexical::Position.new(1, pos + 1)
166
- pos += word.length + 1 # Assuming one space between words.
167
- rley_tokens << Rley::Lexical::Token.new(word, term_name, rank)
168
- end
169
- return rley_tokens
170
- end
171
-
172
- # Convert input text into a sequence of rley token objects...
173
- rley_tokens = tokenizer(lexicon, tokens)
174
-
175
- # Let Rley grok the tokens
176
- result = nlp_engine.parse(rley_tokens)
177
-
178
- puts "Parsing successful? #{result.success?}" # => Parsing successful? true
179
- puts result.failure_reason.message unless result.success?
180
-
181
- ptree = nlp_engine.convert(result)
182
-
183
- visitor = nlp_engine.ptree_visitor(ptree)
184
-
185
- renderer = Rley::Formatter::Asciitree.new($stdout)
186
-
187
- # Let's visualize the parse tree (in text format...)
188
- puts renderer.render(visitor)
1
+ # frozen_string_literal: true
2
+
3
+ require 'rley'
4
+ require 'engtagger' # Load POS (Part-Of-Speech) tagger EngTagger
5
+
6
+ # REGEX to remove XML tags from Engtagger output
7
+ GET_TAG = /<(.+?)>(.*?)<.+?>/.freeze
8
+
9
+ # Text tokenizer
10
+ # Taken directly from Engtagger, will ensure uniform indexing while parsing
11
+ def clean_text(text)
12
+ return false unless valid_text(text)
13
+
14
+ text = text.toutf8
15
+ cleaned_text = text
16
+ tokenized = []
17
+ # Tokenize the text (splitting on punctuation as you go)
18
+ cleaned_text.split(/\s+/).each do |line|
19
+ tokenized += split_punct(line)
20
+ end
21
+ words = split_sentences(tokenized)
22
+ return words
23
+ end
24
+
25
+ def valid_text(text)
26
+ if !text
27
+ # there's nothing to parse
28
+ puts 'method call on uninitialized variable'
29
+ return false
30
+ elsif /\A\s*\z/ =~ text
31
+ # text is an empty string, nothing to parse
32
+ return false
33
+ else
34
+ # $text is valid
35
+ return true
36
+ end
37
+ end
38
+
39
+ def split_sentences(array)
40
+ # rubocop: disable Layout/ArrayAlignment
41
+ tokenized = array
42
+ people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
43
+ supt det mssrs rev]
44
+ army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
45
+ inst = %w[dept univ assn bros ph.d]
46
+ place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
47
+ hwy hway la pde pd plz pl rd st tce]
48
+ comp = %w[mfg inc ltd co corp]
49
+ state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
50
+ ind ia kans kan ken ky la me md is mass mich minn miss mo mont
51
+ neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
52
+ va wash wis wisc wy wyo usafa alta man ont que sask yuk]
53
+ month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
54
+ misc = %w[vs etc no esp]
55
+ abbr = {}
56
+ [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
57
+ abbr[i] = true
58
+ end
59
+ words = []
60
+ tokenized.each_with_index do |_t, i|
61
+ if tokenized[i + 1] &&
62
+ tokenized[i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
63
+ w = $1
64
+ # Don't separate the period off words that
65
+ # meet any of the following conditions:
66
+ #
67
+ # 1. It is defined in one of the lists above
68
+ # 2. It is only one letter long: Alfred E. Sloan
69
+ # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
70
+ unless abbr[w.downcase] ||
71
+ w =~ /\A[a-z]\z/i || w =~ /[a-z](?:\.[a-z])+\z/i
72
+ words << w
73
+ words << '.'
74
+ next
75
+ end
76
+ end
77
+ words << tokenized[i]
78
+ end
79
+
80
+ # If the final word ends in a period..
81
+ if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
82
+ words[-1] = $1
83
+ words.push '.'
84
+ end
85
+ words
86
+ end
87
+ # rubocop: enable Layout/ArrayAlignment
88
+
89
+ # Separate punctuation from words, where appropriate. This leaves trailing
90
+ # periods in place to be dealt with later. Called by the clean_text method.
91
+ def split_punct(text)
92
+ # If there's no punctuation, return immediately
93
+ return [text] if /\A\w+\z/ =~ text
94
+
95
+ # Sanity checks
96
+ text = text.gsub(/\W{10,}/o, ' ')
97
+
98
+ # Put quotes into a standard format
99
+ text = text.gsub(/`(?!`)(?=.*\w)/o, '` ') # Shift left quotes off text
100
+ text = text.gsub(/"(?=.*\w)/o, ' `` ') # Convert left quotes to ``
101
+
102
+ # Convert left quote to `
103
+ text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? "#{$1} ` " : ' ` ' }
104
+ text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
105
+
106
+ # Separate right single quotes
107
+ text = text.gsub(/(\w)'(?!')(?=\W|$)/o, "\\1 ' ")
108
+
109
+ # Handle all other punctuation
110
+ text = text.gsub(/--+/o, ' - ') # Convert and separate dashes
111
+ text = text.gsub(/,(?!\d)/o, ' , ') # Shift comma if not following by digit
112
+ text = text.gsub(/:/o, ' :') # Shift semicolon off
113
+ text = text.gsub(/(\.\.\.+)/o, ' \1 ') # Shift ellipses off
114
+ text = text.gsub(/([(\[{}\])])/o, ' \1 ') # Shift off brackets
115
+
116
+ # Shift off other ``standard'' punctuation
117
+ text = text.gsub(/([!?#$%;~|])/o, ' \1 ')
118
+
119
+ # English-specific contractions
120
+ # Separate off 'd 'm 's
121
+ text = text.gsub(/([A-Za-z])'([dms])\b/o, "\\1 '\\2")
122
+ text = text.gsub(/n't\b/o, " n't") # Separate off n't
123
+ text = text.gsub(/'(ve|ll|re)\b/o, " '\\1") # Separate off 've, 'll, 're
124
+ result = text.split
125
+ return result
126
+ end
127
+
128
+
129
+ # Instantiate a facade object as our Rley interface
130
+ nlp_engine = Rley::Engine.new
131
+
132
+ # Now build a very simplified English grammar...
133
+ nlp_engine.build_grammar do
134
+ # Terminals have same names as POS tags returned by Engtagger
135
+ add_terminals('NN', 'NNP')
136
+ add_terminals('DET', 'IN', 'VBD')
137
+
138
+ # Here we define the productions (= grammar rules)
139
+ rule 'S' => %w[NP VP]
140
+ rule 'NP' => 'NNP'
141
+ rule 'NP' => %w[DET NN]
142
+ rule 'NP' => %w[DET NN PP]
143
+ rule 'VP' => %w[VBD NP]
144
+ rule 'VP' => %w[VBD NP PP]
145
+ rule 'PP' => %w[IN NP]
146
+ end
147
+
148
+ # text = "Yo I'm not done with you"
149
+ text = 'John saw Mary with a telescope'
150
+ puts "Input text --> #{text}"
151
+
152
+ tgr = EngTagger.new
153
+
154
+ # Generate raw POS output
155
+ tagged = tgr.add_tags(text)
156
+
157
+ # Generte tokenied lexicon of input text
158
+ # Instead of creating a lexicon dictionary,
159
+ # we would simply generate one each time on the fly for the current text only.
160
+ lexicon = clean_text(text)
161
+
162
+ # Convert EngTagger POS tokens in [[word, pos], ..] format
163
+ tokens = tagged.scan(GET_TAG).map { |tag, word| [word, tag.upcase] }
164
+
165
+ def tokenizer(lexicon, tokens)
166
+ pos = -1
167
+ rley_tokens = []
168
+ lexicon.each_with_index do |word, i|
169
+ term_name = tokens[i].last
170
+ rank = Rley::Lexical::Position.new(1, pos + 1)
171
+ pos += word.length + 1 # Assuming one space between words.
172
+ rley_tokens << Rley::Lexical::Token.new(word, term_name, rank)
173
+ end
174
+ return rley_tokens
175
+ end
176
+
177
+ # Convert input text into a sequence of rley token objects...
178
+ rley_tokens = tokenizer(lexicon, tokens)
179
+
180
+ # Let Rley grok the tokens
181
+ result = nlp_engine.parse(rley_tokens)
182
+
183
+ puts "Parsing successful? #{result.success?}" # => Parsing successful? true
184
+ puts result.failure_reason.message unless result.success?
185
+
186
+ ptree = nlp_engine.convert(result)
187
+
188
+ visitor = nlp_engine.ptree_visitor(ptree)
189
+
190
+ renderer = Rley::Formatter::Asciitree.new($stdout)
191
+
192
+ # Let's visualize the parse tree (in text format...)
193
+ puts renderer.render(visitor)