rley 0.5.07 → 0.5.08

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/examples/NLP/{benchmark_mini_en.rb → benchmark_pico_en.rb} +0 -0
  4. data/examples/NLP/nano_eng/nano_en_demo.rb +118 -0
  5. data/examples/NLP/nano_eng/nano_grammar.rb +59 -0
  6. data/examples/NLP/{mini_en_demo.rb → pico_en_demo.rb} +2 -2
  7. data/examples/general/SRL/lib/ast_builder.rb +176 -0
  8. data/examples/general/SRL/lib/ast_building.rb +20 -0
  9. data/examples/general/SRL/lib/grammar.rb +32 -0
  10. data/examples/general/SRL/lib/parser.rb +26 -0
  11. data/examples/general/SRL/lib/regex/multiplicity.rb +94 -0
  12. data/examples/general/SRL/lib/regex_repr.rb +1 -0
  13. data/examples/general/SRL/lib/srl_demo.rb +67 -0
  14. data/examples/general/SRL/lib/tokenizer.rb +101 -0
  15. data/examples/general/SRL/spec/integration_spec.rb +103 -0
  16. data/examples/general/SRL/spec/regex/multiplicity_spec.rb +83 -0
  17. data/examples/general/SRL/spec/spec_helper.rb +25 -0
  18. data/examples/general/SRL/spec/tokenizer_spec.rb +125 -0
  19. data/examples/general/SRL/srl_demo.rb +57 -0
  20. data/examples/general/calc_iter1/calc_demo.rb +1 -1
  21. data/examples/general/calc_iter2/ast_building.rb +20 -0
  22. data/examples/general/calc_iter2/calc_ast_builder.rb +3 -23
  23. data/examples/general/calc_iter2/calc_demo.rb +1 -1
  24. data/lib/rley/base/base_parser.rb +1 -1
  25. data/lib/rley/base/grm_items_builder.rb +1 -1
  26. data/lib/rley/constants.rb +1 -1
  27. data/lib/rley/gfg/non_terminal_vertex.rb +1 -1
  28. data/lib/rley/parser/gfg_chart.rb +8 -3
  29. data/lib/rley/parser/gfg_earley_parser.rb +5 -2
  30. data/lib/rley/parser/gfg_parsing.rb +5 -1
  31. data/lib/rley/parser/parse_tree_builder.rb +16 -5
  32. data/lib/rley/ptree/terminal_node.rb +3 -2
  33. data/spec/rley/parser/ast_builder_spec.rb +2 -2
  34. data/spec/rley/parser/cst_builder_spec.rb +2 -3
  35. metadata +20 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 4ce368c99ffa556898d9a89788786e47b5e8b115
- data.tar.gz: 97a691b869089989f601556ed13dd24d729b7ad0
+ metadata.gz: 3c616b691fb51ba2eb00a25fee75ff4a80093990
+ data.tar.gz: 1039cfe8f29c8d1ec7c88fa83c18f9173763b8f2
  SHA512:
- metadata.gz: b50595273bf5f75e25b6e609de197be31a4f8583f1c1f93ef8494a8e5367facdbafd13326bd2097c9da657096c2f15e9dc8d439d16212f4e578134ce538984b2
- data.tar.gz: 8be6b6bdef48de1cbd19b2785530a5bc17e43804a43ff01f287a89a129a83897f8ea05ecb951018c99f888bbfefbbc33e89ee9cfb0adb2e86eba2e3971703c1d
+ metadata.gz: df7412344421bd421fb459fe5cf8053618dea1212c4da27e83cf41225dbaf664d9b143499978e6bcef2ae293a7bf9378d3ecb4867f989553f798e9723ba8344b
+ data.tar.gz: 436474ceafd2689137fab890b19ca24715ebe72dd1311b3ad64313bc130cf8bbce12fe35049008d20a89634309cbac882da70bc891522d45a58e8ce310b466a7
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+ ### 0.5.08 / 2017-11-xx
+ * [FIX] Method `BaseParser::initialize`: missing parameter name in doc caused a YARD warning.
+ * [FIX] Method `GrmItemsBuilder::build_dotted_items`: missing parameter name in doc caused a YARD warning.
+ * [FIX] Method `NonTerminalVertex::initialize`: missing parameter name in doc caused a YARD warning.
+
+
  ### 0.5.07 / 2017-11-11
  * [NEW] File `benchmark_mini_en.rb` added in `examples/NLP` folder for parsing performance measurements.
  * [CHANGE] Demo calculator in `examples/general/calc_iter2`: added support for log10 and cbrt functions. README.md slightly reworked.
data/examples/NLP/nano_eng/nano_en_demo.rb ADDED
@@ -0,0 +1,118 @@
+ require 'rley' # Load Rley library
+
+ ########################################
+ # Step 1. Define a grammar for a nano English-like language
+ # based on an example from the Jurafsky & Martin book (chapter 12 of the book).
+ # Daniel Jurafsky, James H. Martin: "Speech and Language Processing";
+ # 2009, Pearson Education, Inc., ISBN 978-0135041963
+ # It defines the syntax of a sentence in a mini English-like language
+ # with a very simplified syntax and vocabulary
+
+ # Instantiate a builder object that will build the grammar for us
+ builder = Rley::Syntax::GrammarBuilder.new do
+   # In the next 2 lines we define the terminal symbols
+   # (= word categories in the lexicon)
+   add_terminals('Noun', 'Proper-Noun', 'Pronoun', 'Verb')
+   add_terminals('Aux', 'Determiner', 'Preposition')
+
+   # Here we define the productions (= grammar rules)
+   rule 'Start' => 'S'
+   rule 'S' => %w[NP VP]
+   rule 'S' => %w[Aux NP VP]
+   rule 'S' => 'VP'
+   rule 'NP' => 'Pronoun'
+   rule 'NP' => 'Proper-Noun'
+   rule 'NP' => %w[Determiner Nominal]
+   rule 'Nominal' => 'Noun'
+   rule 'Nominal' => %w[Nominal Noun]
+   rule 'VP' => 'Verb'
+   rule 'VP' => %w[Verb NP]
+   rule 'VP' => %w[Verb NP PP]
+   rule 'VP' => %w[Verb PP]
+   rule 'VP' => %w[VP PP]
+   rule 'PP' => %w[Preposition NP]
+ end
+
+ # And now, let's build the grammar...
+ grammar = builder.grammar
+
+ ########################################
+ # Step 2. Creating a lexicon
+ # To simplify things, the lexicon is implemented as a Hash with pairs of the form:
+ # word => terminal symbol name
+ Lexicon = {
+   'man' => 'Noun',
+   'dog' => 'Noun',
+   'cat' => 'Noun',
+   'telescope' => 'Noun',
+   'park' => 'Noun',
+   'saw' => 'Verb',
+   'ate' => 'Verb',
+   'walked' => 'Verb',
+   'John' => 'Proper-Noun',
+   'Mary' => 'Proper-Noun',
+   'Bob' => 'Proper-Noun',
+   'a' => 'Determiner',
+   'an' => 'Determiner',
+   'the' => 'Determiner',
+   'my' => 'Determiner',
+   'in' => 'Preposition',
+   'on' => 'Preposition',
+   'by' => 'Preposition',
+   'with' => 'Preposition'
+ }.freeze
+
+ ########################################
+ # Step 3. Creating a tokenizer
+ # A tokenizer reads the input string and converts it into a sequence of tokens
+ # Highly simplified tokenizer implementation.
+ def tokenizer(aTextToParse, aGrammar)
+   tokens = aTextToParse.scan(/\S+/).map do |word|
+     term_name = Lexicon[word]
+     raise StandardError, "Word '#{word}' not found in lexicon" if term_name.nil?
+     terminal = aGrammar.name2symbol[term_name]
+     Rley::Lexical::Token.new(word, terminal)
+   end
+
+   return tokens
+ end
+
+ ########################################
+ # Step 4. Create a parser for that grammar
+ # Easy with Rley...
+ parser = Rley::Parser::GFGEarleyParser.new(grammar)
+
+ ########################################
+ # Step 5. Parsing the input
+ input_to_parse = 'John saw Mary with a telescope'
+ # input_to_parse = 'the dog saw a man in the park' # This one is ambiguous
+ # Convert input text into a sequence of token objects...
+ tokens = tokenizer(input_to_parse, grammar)
+ result = parser.parse(tokens)
+
+ puts "Parsing successful? #{result.success?}"
+ unless result.success?
+   puts result.failure_reason.message
+   exit(1)
+ end
+
+ ########################################
+ # Step 6. Generating a parse tree from parse result
+ ptree = result.parse_tree
+
+ # Let's create a parse tree visitor
+ visitor = Rley::ParseTreeVisitor.new(ptree)
+
+ # Let's create a formatter (i.e. visit event listener)
+ # renderer = Rley::Formatter::Debug.new($stdout)
+
+ # Let's create a formatter that will render the parse tree with characters
+ renderer = Rley::Formatter::Asciitree.new($stdout)
+
+ # Let's create a formatter that will render the parse tree in labelled
+ # bracket notation
+ # renderer = Rley::Formatter::BracketNotation.new($stdout)
+
+ # Subscribe the formatter to the visitor's events and launch the visit
+ renderer.render(visitor)
+ # End of file
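
A side note on Step 6 of the demo above: the Debug, Asciitree and BracketNotation formatters are interchangeable because each one subscribes to the visitor's events and writes to the IO object it received in its constructor. Assuming only that (the demo's commented-out lines suggest it, but do not show it running), the labelled-bracket output could be captured into a string instead of being printed, e.g.:

  require 'stringio'

  buffer = StringIO.new
  # Assumption: BracketNotation accepts any IO-like object, just as Asciitree
  # is given $stdout in the demo above.
  bracket_renderer = Rley::Formatter::BracketNotation.new(buffer)

  # Walk the same parse tree built in Step 6 with a fresh visitor.
  another_visitor = Rley::ParseTreeVisitor.new(ptree)
  bracket_renderer.render(another_visitor)
  puts buffer.string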
data/examples/NLP/nano_eng/nano_grammar.rb ADDED
@@ -0,0 +1,59 @@
+ # Grammar for a simple subset of the English language
+ # It is called nano-English because it has a more elaborate
+ # grammar than pico-English but still remains tiny compared to "real" English
+ require 'rley' # Load the gem
+
+
+ ########################################
+ # Define a grammar for a nano English-like language
+ # based on chapter 12 from the Jurafsky & Martin book.
+ # Daniel Jurafsky, James H. Martin: "Speech and Language Processing";
+ # 2009, Pearson Education, Inc., ISBN 978-0135041963
+ # It defines the syntax of a sentence in a mini English-like language
+ builder = Rley::Syntax::GrammarBuilder.new do
+   add_terminals('Pronoun', 'Proper-Noun')
+   add_terminals('Determiner', 'Noun')
+   add_terminals('Cardinal_number', 'Ordinal_number', 'Quant')
+   add_terminals('Verb', 'GerundV', 'Aux')
+   add_terminals('Predeterminer', 'Preposition')
+
+   rule 'language' => 'sentence'
+   rule 'sentence' => 'declarative'
+   rule 'sentence' => 'imperative'
+   rule 'sentence' => 'yes_no_question'
+   rule 'sentence' => 'wh_subject_question'
+   rule 'sentence' => 'wh_non_subject_question'
+   rule 'declarative' => %w[NP VP]
+   rule 'imperative' => 'VP'
+   rule 'yes_no_question' => %w[Aux NP VP]
+   rule 'wh_subject_question' => %w[Wh_NP NP VP]
+   rule 'wh_non_subject_question' => %w[Wh_NP Aux NP VP]
+   rule 'NP' => %w[Predeterminer NP]
+   rule 'NP' => 'Pronoun'
+   rule 'NP' => 'Proper-Noun'
+   rule 'NP' => %w[Det Card Ord Quant Nominal]
+   rule 'VP' => 'Verb'
+   rule 'VP' => %w[Verb NP]
+   rule 'VP' => %w[Verb NP PP]
+   rule 'VP' => %w[Verb PP]
+   rule 'Det' => 'Determiner'
+   rule 'Det' => []
+   rule 'Card' => 'Cardinal_number'
+   rule 'Card' => []
+   rule 'Ord' => 'Ordinal_number'
+   rule 'Ord' => []
+   rule 'Nominal' => 'Noun'
+   rule 'Nominal' => %w[Nominal Noun]
+   rule 'Nominal' => %w[Nominal GerundVP]
+   rule 'Nominal' => %w[Nominal RelClause]
+   rule 'PP' => %w[Preposition NP]
+   rule 'GerundVP' => 'GerundV'
+   rule 'GerundVP' => %w[GerundV NP]
+   rule 'GerundVP' => %w[GerundV NP PP]
+   rule 'GerundVP' => %w[GerundV PP]
+   rule 'RelClause' => %w[Relative_pronoun VP]
+
+ end
+
+ # And now build the grammar...
+ NanoGrammar = builder.grammar
data/examples/NLP/{mini_en_demo.rb → pico_en_demo.rb} CHANGED
@@ -1,12 +1,12 @@
  require 'rley' # Load Rley library

  ########################################
- # Step 1. Define a grammar for a micro English-like language
+ # Step 1. Define a grammar for a pico English-like language
  # based on example from NLTK book (chapter 8 of the book).
  # Bird, Steven, Edward Loper and Ewan Klein: "Natural Language Processing
  # with Python"; 2009, O’Reilly Media Inc., ISBN 978-0596516499
  # It defines the syntax of a sentence in a mini English-like language
- # with a very simplified syntax.
+ # with a very simplified syntax and vocabulary

  # Instantiate a builder object that will build the grammar for us
  builder = Rley::Syntax::GrammarBuilder.new do
data/examples/general/SRL/lib/ast_builder.rb ADDED
@@ -0,0 +1,176 @@
+ require_relative 'ast_building'
+ require_relative 'regex_repr'
+
+ # The purpose of an ASTBuilder is to build, piece by piece, an AST
+ # (Abstract Syntax Tree) from a sequence of input tokens and
+ # visit events produced by walking over a GFGParsing object.
+ # Uses the Builder GoF pattern.
+ # The Builder pattern creates a complex object
+ # (say, a parse tree) from simpler objects (terminal and non-terminal
+ # nodes) using a step-by-step approach.
+ class ASTBuilder < Rley::Parser::ParseTreeBuilder
+   include ASTBuilding
+
+   Terminal2NodeClass = {}.freeze
+
+   protected
+
+   # Overriding method.
+   # Factory method for creating a node object for the given
+   # input token.
+   # @param aTerminal [Terminal] Terminal symbol associated with the token
+   # @param aTokenPosition [Integer] Position of token in the input stream
+   # @param aToken [Token] The input token
+   def new_leaf_node(aProduction, aTerminal, aTokenPosition, aToken)
+     node = Rley::PTree::TerminalNode.new(aToken, aTokenPosition)
+
+     return node
+   end
+
+   # Overriding method.
+   # Factory method for creating a parent node object.
+   # @param aProduction [Production] Production rule
+   # @param aRange [Range] Range of tokens matched by the rule
+   # @param theTokens [Array] The input tokens
+   # @param theChildren [Array] Children nodes (one per rhs symbol)
+   def new_parent_node(aProduction, aRange, theTokens, theChildren)
+     node = case aProduction.name
+       when 'srl_0' # rule 'srl' => 'quantifier'
+         return_first_child(aRange, theTokens, theChildren)
+
+       when 'quantifier_0' # rule 'quantifier' => 'ONCE'
+         multiplicity(1, 1)
+
+       when 'quantifier_1' # rule 'quantifier' => 'TWICE'
+         multiplicity(2, 2)
+
+       when 'quantifier_2' # rule 'quantifier' => %w[EXACTLY count TIMES]
+         reduce_quantifier_2(aProduction, aRange, theTokens, theChildren)
+
+       when 'quantifier_3' # rule 'quantifier' => %w[BETWEEN count AND count times_suffix]
+         reduce_quantifier_3(aProduction, aRange, theTokens, theChildren)
+
+       when 'quantifier_4' # rule 'quantifier' => 'OPTIONAL'
+         multiplicity(0, 1)
+
+       when 'quantifier_5' # rule 'quantifier' => %w[ONCE OR MORE]
+         multiplicity(1, :more)
+
+       when 'quantifier_6' # rule 'quantifier' => %w[NEVER OR MORE]
+         multiplicity(0, :more)
+
+       when 'quantifier_7' # rule 'quantifier' => %w[AT LEAST count TIMES]
+         reduce_quantifier_7(aProduction, aRange, theTokens, theChildren)
+
+       when 'count_0', 'count_1'
+         return_first_child(aRange, theTokens, theChildren)
+
+       when 'times_suffix_0', 'times_suffix_1'
+         nil
+       else
+         raise StandardError, "Don't know production #{aProduction.name}"
+     end
+
+     return node
+   end
+
+   def multiplicity(lowerBound, upperBound)
+     return SRL::Regex::Multiplicity.new(lowerBound, upperBound, :greedy)
+   end
+
+   # rule 'quantifier' => %w[EXACTLY count TIMES]
+   def reduce_quantifier_2(aProduction, aRange, theTokens, theChildren)
+     count = theChildren[1].token.lexeme.to_i
+     multiplicity(count, count)
+   end
+
+   # rule 'quantifier' => %w[BETWEEN count AND count times_suffix]
+   def reduce_quantifier_3(aProduction, aRange, theTokens, theChildren)
+     lower = theChildren[1].token.lexeme.to_i
+     upper = theChildren[3].token.lexeme.to_i
+     multiplicity(lower, upper)
+   end
+
+   # rule 'quantifier' => %w[AT LEAST count TIMES]
+   def reduce_quantifier_7(aProduction, aRange, theTokens, theChildren)
+     count = theChildren[2].token.lexeme.to_i
+     multiplicity(count, :more)
+   end
+
+
+ =begin
+   def reduce_binary_operator(theChildren)
+     operator_node = theChildren[1]
+     operator_node.children << theChildren[0]
+     operator_node.children << theChildren[2]
+     return operator_node
+   end
+
+   # rule 'simple_expression' => %w[simple_expression add_operator term]
+   def reduce_simple_expression_1(_production, _range, _tokens, theChildren)
+     reduce_binary_operator(theChildren)
+   end
+
+   # rule 'term' => %w[term mul_operator factor]
+   def reduce_term_1(_production, _range, _tokens, theChildren)
+     reduce_binary_operator(theChildren)
+   end
+
+   # rule 'factor' => %w[simple_factor POWER simple_factor]
+   def reduce_factor_1(aProduction, aRange, theTokens, theChildren)
+     result = PowerNode.new(theChildren[1].symbol, aRange)
+     result.children << theChildren[0]
+     result.children << theChildren[2]
+
+     return result
+   end
+
+   # rule 'simple_factor' => %w[sign scalar]
+   def reduce_simple_factor_0(aProduction, aRange, theTokens, theChildren)
+     first_child = theChildren[0]
+     result = if first_child.kind_of?(CalcNegateNode)
+       -theChildren[1]
+     else
+       theChildren[1]
+     end
+
+     return result
+   end
+
+   # rule 'simple_factor' => %w[unary_function in_parenthesis]
+   def reduce_simple_factor_1(aProduction, aRange, theTokens, theChildren)
+     func = CalcUnaryFunction.new(theChildren[0].symbol, aRange.low)
+     func.func_name = theChildren[0].value
+     func.children << theChildren[1]
+     return func
+   end
+
+   # rule 'simple_factor' => %w[MINUS in_parenthesis]
+   def reduce_simple_factor_2(aProduction, aRange, theTokens, theChildren)
+     negation = CalcNegateNode.new(theChildren[0].symbol, aRange.low)
+     negation.children << theChildren[1]
+     return negation
+   end
+
+   # rule 'add_operator' => 'PLUS'
+   def reduce_add_operator_0(_production, aRange, _tokens, theChildren)
+     return CalcAddNode.new(theChildren[0].symbol, aRange)
+   end
+
+   # rule 'add_operator' => 'MINUS'
+   def reduce_add_operator_1(_production, aRange, _tokens, theChildren)
+     return CalcSubtractNode.new(theChildren[0].symbol, aRange)
+   end
+
+   # rule 'mul_operator' => 'STAR'
+   def reduce_mul_operator_0(_production, aRange, _tokens, theChildren)
+     return CalcMultiplyNode.new(theChildren[0].symbol, aRange)
+   end
+
+   # rule 'mul_operator' => 'DIVIDE'
+   def reduce_mul_operator_1(_production, aRange, _tokens, theChildren)
+     return CalcDivideNode.new(theChildren[0].symbol, aRange)
+   end
+ =end
+ end # class
+ # End of file
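
All of the quantifier reductions above funnel into multiplicity(lower, upper), i.e. an SRL::Regex::Multiplicity value with a lower bound, an upper bound (an Integer or :more) and a :greedy policy. The multiplicity.rb source is not reproduced in this excerpt, so purely as orientation, the regex quantifier that such a (lower, upper) pair is meant to denote can be sketched with a small stand-alone helper (hypothetical, not part of the gem):

  # Illustrative only: maps a (lower, upper) multiplicity to regex quantifier text.
  def quantifier_text(lower, upper)
    return '?' if lower.zero? && upper == 1       # OPTIONAL
    return '*' if lower.zero? && upper == :more   # NEVER OR MORE
    return '+' if lower == 1 && upper == :more    # ONCE OR MORE
    return "{#{lower}}" if lower == upper         # ONCE / TWICE / EXACTLY n TIMES
    return "{#{lower},}" if upper == :more        # AT LEAST n TIMES
    "{#{lower},#{upper}}"                         # BETWEEN n AND m times
  end

  quantifier_text(2, 2)     # => "{2}"
  quantifier_text(0, :more) # => "*"
  quantifier_text(3, 7)     # => "{3,7}"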
data/examples/general/SRL/lib/ast_building.rb ADDED
@@ -0,0 +1,20 @@
+ # Mix-in module that provides convenience methods for
+ # constructing an AST (Abstract Syntax Tree).
+ module ASTBuilding
+   def return_first_child(_range, _tokens, theChildren)
+     return theChildren[0]
+   end
+
+   def return_second_child(_range, _tokens, theChildren)
+     return theChildren[1]
+   end
+
+   def return_last_child(_range, _tokens, theChildren)
+     return theChildren[-1]
+   end
+
+   def return_epsilon(_range, _tokens, _children)
+     return nil
+   end
+ end # module
+ # End of file
data/examples/general/SRL/lib/grammar.rb ADDED
@@ -0,0 +1,32 @@
+ # Grammar for SRL (Simple Regex Language)
+ require 'rley' # Load the gem
+ module SRL
+   ########################################
+   # Work in progress.
+   # This is a very partial grammar of SRL.
+   # It will be expanded in coming versions of Rley.
+   builder = Rley::Syntax::GrammarBuilder.new do
+     add_terminals('DIGIT', 'INTEGER')
+     add_terminals('EXACTLY', 'TIMES', 'ONCE', 'TWICE')
+     add_terminals('BETWEEN', 'AND', 'OPTIONAL', 'OR')
+     add_terminals('MORE', 'NEVER', 'AT', 'LEAST')
+
+     # For the moment we focus on the quantifier syntax only...
+     rule 'srl' => 'quantifier'
+     rule 'quantifier' => 'ONCE'
+     rule 'quantifier' => 'TWICE'
+     rule 'quantifier' => %w[EXACTLY count TIMES]
+     rule 'quantifier' => %w[BETWEEN count AND count times_suffix]
+     rule 'quantifier' => 'OPTIONAL'
+     rule 'quantifier' => %w[ONCE OR MORE]
+     rule 'quantifier' => %w[NEVER OR MORE]
+     rule 'quantifier' => %w[AT LEAST count TIMES]
+     rule 'count' => 'DIGIT'
+     rule 'count' => 'INTEGER'
+     rule 'times_suffix' => 'TIMES'
+     rule 'times_suffix' => []
+   end
+
+   # And now build the grammar and make it accessible via a global constant
+   Grammar = builder.grammar
+ end # module
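
Taken together with the ASTBuilder shown earlier, this grammar is already enough to recognize an isolated SRL quantifier phrase. The following is a minimal sketch of that wiring; it hand-builds the tokens for "exactly 2 times" instead of using the tokenizer.rb listed in this diff (whose source is not reproduced here), and the token/terminal pairing simply mirrors the tokenizer of nano_en_demo.rb above. Mapping '2' to the DIGIT terminal is an assumption:

  require 'rley'
  require_relative 'grammar' # defines SRL::Grammar (the file shown above)

  parser = Rley::Parser::GFGEarleyParser.new(SRL::Grammar)

  # Hand-rolled tokens for the SRL phrase "exactly 2 times".
  pairs = [['exactly', 'EXACTLY'], ['2', 'DIGIT'], ['times', 'TIMES']]
  tokens = pairs.map do |lexeme, term_name|
    terminal = SRL::Grammar.name2symbol[term_name]
    Rley::Lexical::Token.new(lexeme, terminal)
  end

  result = parser.parse(tokens)
  puts "Parsing successful? #{result.success?}"
  # srl_demo.rb (not shown in this excerpt) goes further and turns the result
  # into a Multiplicity node by means of the ASTBuilder.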