rley 0.0.02

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +15 -0
  2. data/.rspec +1 -0
  3. data/.rubocop.yml +74 -0
  4. data/.ruby-gemset +1 -0
  5. data/.ruby-version +1 -0
  6. data/.simplecov +7 -0
  7. data/.travis.yml +21 -0
  8. data/.yardopts +6 -0
  9. data/CHANGELOG.md +10 -0
  10. data/Gemfile +8 -0
  11. data/LICENSE.txt +19 -0
  12. data/README.md +19 -0
  13. data/Rakefile +32 -0
  14. data/lib/rley/constants.rb +26 -0
  15. data/lib/rley/parser/chart.rb +39 -0
  16. data/lib/rley/parser/dotted_item.rb +80 -0
  17. data/lib/rley/parser/earley_parser.rb +177 -0
  18. data/lib/rley/parser/parse_state.rb +54 -0
  19. data/lib/rley/parser/parsing.rb +101 -0
  20. data/lib/rley/parser/state_set.rb +47 -0
  21. data/lib/rley/parser/token.rb +21 -0
  22. data/lib/rley/syntax/grammar.rb +59 -0
  23. data/lib/rley/syntax/grm_symbol.rb +18 -0
  24. data/lib/rley/syntax/literal.rb +20 -0
  25. data/lib/rley/syntax/non_terminal.rb +18 -0
  26. data/lib/rley/syntax/production.rb +42 -0
  27. data/lib/rley/syntax/symbol_seq.rb +36 -0
  28. data/lib/rley/syntax/terminal.rb +18 -0
  29. data/lib/rley/syntax/verbatim_symbol.rb +21 -0
  30. data/spec/rley/parser/chart_spec.rb +47 -0
  31. data/spec/rley/parser/dotted_item_spec.rb +108 -0
  32. data/spec/rley/parser/earley_parser_spec.rb +271 -0
  33. data/spec/rley/parser/parse_state_spec.rb +99 -0
  34. data/spec/rley/parser/parsing_spec.rb +118 -0
  35. data/spec/rley/parser/state_set_spec.rb +68 -0
  36. data/spec/rley/parser/token_spec.rb +40 -0
  37. data/spec/rley/syntax/grammar_spec.rb +149 -0
  38. data/spec/rley/syntax/grm_symbol_spec.rb +29 -0
  39. data/spec/rley/syntax/literal_spec.rb +32 -0
  40. data/spec/rley/syntax/non_terminal_spec.rb +29 -0
  41. data/spec/rley/syntax/production_spec.rb +50 -0
  42. data/spec/rley/syntax/symbol_seq_spec.rb +65 -0
  43. data/spec/rley/syntax/terminal_spec.rb +29 -0
  44. data/spec/rley/syntax/verbatim_symbol_spec.rb +32 -0
  45. data/spec/spec_helper.rb +21 -0
  46. metadata +166 -0
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ZTgxOWU0YmIzMDdlZmQ3NGVlZDBkYzcxZTEzNDQ4NDgwMWM3ZmZiOA==
5
+ data.tar.gz: !binary |-
6
+ ZmZkNWZlZDgwZWQ2ZTYzYjA5ZjkyNTZlZjMwMGZmMjIwNjVjODFjNQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ ZGY2YzBlMTM0MTNiZWE3ZjQyYmNjOWMzYWQ4ZTY4N2RjMDQ4YzExMTg5MTM5
10
+ NjFlMDRlZjYyYWM5NDJmYzlhNzY3YWE3N2FiZDVhNGM2NzVhOGMwZjZmZjE0
11
+ YjhkZjFiNGVlOTQwMmZjZjkzNWQ3ZGY3NGM1Y2M4YWU3ZjE3MDI=
12
+ data.tar.gz: !binary |-
13
+ MDViNDQ3MjBjOTg1MWI2NmJmNmRhZTg2MzQ0MmRlMDZmY2JmMDhiNTZlY2Zi
14
+ NTMwZDdlNGI1MWIwMzkxN2FiNjMyZjk4ZWViZjk0YzJlMTY0MmMyZmVlN2U3
15
+ MGJiYjFlNDE5NzM0MzhlMWQzNGIyMDBmOTJkZDQwNDYyNDVjNjQ=
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --backtrace
data/.rubocop.yml ADDED
@@ -0,0 +1,74 @@
1
+ AllCops:
2
+ Exclude:
3
+ - 'examples/**/*'
4
+ - 'features/**/*'
5
+ - 'gems/**/*'
6
+
7
+ # This is disabled because some demos use UTF-8
8
+ AsciiComments:
9
+ Enabled: false
10
+
11
+ CaseIndentation:
12
+ IndentWhenRelativeTo: end
13
+ IndentOneStep: true
14
+
15
+ # Rubocop enforces the use of is_a? instead of kind_of?
16
+ # Which is contrary to modelling practice.
17
+ ClassCheck:
18
+ Enabled: false
19
+
20
+ ClassLength:
21
+ Max: 250
22
+ CountComments: false
23
+
24
+ ConstantName:
25
+ Enabled: false
26
+
27
+ CyclomaticComplexity:
28
+ Enabled: false
29
+
30
+ DefWithParentheses:
31
+ Enabled: false
32
+
33
+ Documentation:
34
+ Enabled: false
35
+
36
+ EmptyLines:
37
+ Enabled: false
38
+
39
+ EmptyLinesAroundBody:
40
+ Enabled: false
41
+
42
+ Encoding:
43
+ Enabled: false
44
+
45
+ FileName:
46
+ Enabled: false
47
+
48
+ IndentationWidth :
49
+ Enabled: false
50
+
51
+
52
+ # Avoid methods longer than 50 lines of code
53
+ MethodLength:
54
+ Max: 50
55
+ CountComments: false
56
+
57
+ NonNilCheck:
58
+ Enabled: false
59
+
60
+ NumericLiterals:
61
+ Enabled: false
62
+
63
+ RaiseArgs:
64
+ Enabled: false
65
+
66
+ RedundantReturn:
67
+ Enabled: false
68
+
69
+ SpaceInsideBrackets:
70
+ Enabled: false
71
+
72
+ TrailingWhitespace:
73
+ Enabled: false
74
+
data/.ruby-gemset ADDED
@@ -0,0 +1 @@
1
+ rley
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 1.9.3
data/.simplecov ADDED
@@ -0,0 +1,7 @@
1
+ # .simplecov
2
+ # Configuration
3
+
4
+ SimpleCov.start do
5
+ # Remove all files that match /spec/ in their path
6
+ add_filter "/spec/"
7
+ end
data/.travis.yml ADDED
@@ -0,0 +1,21 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.1.0
4
+ - 2.0.0
5
+ - 1.9.3
6
+ - 1.9.2
7
+ - jruby-19mode
8
+ - jruby-head
9
+
10
+ # Workaround issue of jruby-head configuration on Travis CI
11
+ matrix:
12
+ allow_failures:
13
+ - rvm: jruby-head
14
+
15
+ gemfile:
16
+ - Gemfile
17
+
18
+ # whitelist
19
+ branches:
20
+ only:
21
+ - master
data/.yardopts ADDED
@@ -0,0 +1,6 @@
1
+ --exclude examples --exclude features --exclude spec
2
+ --no-private
3
+ --markup markdown
4
+ -
5
+ CHANGELOG.md
6
+ LICENSE.txt
data/CHANGELOG.md ADDED
@@ -0,0 +1,10 @@
1
+ ### 0.0.02 / 2014-11-12
2
+ * [CHANGE] File `README.md`: Added Travis CI badge.
3
+
4
+
5
+ ### 0.0.01 / 2014-11-12
6
+ * [CHANGE] Rley is "gemmified"!
7
+
8
+
9
+ ### 0.0.00 / 2014-11-07
10
+ * [FEATURE] Initial public working version
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source 'https://rubygems.org'
2
+ # Prevent Bundler from loading the dependencies from our .gemspec file
3
+
4
+ group :development do
5
+ gem 'rake', '>= 0.8.0'
6
+ gem 'rspec', '>= 3.0.0'
7
+ gem 'simplecov', '>= 0.5.0'
8
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2014 Dimitri Geshef
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,19 @@
1
+ Rley
2
+ ===========
3
+ [Homepage](https://github.com/famished-tiger/Rley)
4
+
5
+
6
+ [![Build Status](https://travis-ci.org/famished-tiger/Rley.svg?branch=master)](https://travis-ci.org/famished-tiger/Rley)
7
+
8
+ ### What is Rley? ###
9
+ __Rley__ is a Ruby implementation of an Earley parser.
10
+ The objective is to build a parser convenient for lightweight NLP (Natural Language Processing) purposes.
11
+
12
+ This project is in "early" stage.
13
+ Consult Wikipedia to learn more about Earley's parsing algorithm.
14
+
15
+
16
+ Copyright
17
+ ---------
18
+ Copyright (c) 2014, Dimitri Geshef.
19
+ __Rley__ is released under the MIT License; see [LICENSE.txt](https://github.com/famished-tiger/Rley/blob/master/LICENSE.txt) for details.
data/Rakefile ADDED
@@ -0,0 +1,32 @@
1
+ require 'rubygems'
2
+ require_relative './lib/rley/constants'
3
+
4
+ namespace :gem do
5
+
6
+ desc 'Push the gem to rubygems.org'
7
+ task :push do
8
+ system("gem push rley-#{Rley::Version}.gem")
9
+ end
10
+
11
+ end # namespace
12
+
13
+
14
+ # Testing-specific tasks
15
+
16
+ # RSpec as testing tool
17
+ require 'rspec/core/rake_task'
18
+ desc 'Run RSpec'
19
+ RSpec::Core::RakeTask.new do |spec|
20
+ spec.pattern = 'spec/**/*_spec.rb'
21
+ end
22
+
23
+
24
+ # Combine RSpec tests
25
+ desc 'Run tests, with RSpec'
26
+ task test: [:spec]
27
+
28
+
29
+ # Default rake task
30
+ task default: :test
31
+
32
+ # End of file
@@ -0,0 +1,26 @@
1
+ # File: constants.rb
2
+ # Purpose: definition of Rley constants.
3
+
4
module Rley # Module used as a namespace
  # The version number of the gem.
  Version = '0.0.02'

  # Brief description of the gem.
  Description = "Ruby implementation of the Earley's parsing algorithm"

  # Constant Rley::RootDir holds the absolute path of Rley's start
  # directory (two levels above the directory of this file).
  # Note: the path always ends with a slash character.
  # The definition is guarded because a constant must not be
  # initialised more than once (e.g. when this file is loaded twice).
  unless defined?(RootDir)
    require 'pathname' # Load Pathname class from standard library

    # The start folder of Rley.
    RootDir = "#{Pathname(__FILE__).dirname.parent.parent.expand_path}/"
  end
end # module
25
+
26
+ # End of file
@@ -0,0 +1,39 @@
1
+ require_relative 'state_set'
2
+ require_relative 'parse_state'
3
+
4
module Rley # This module is used as a namespace
  module Parser # This module is used as a namespace
    # A chart (also known as a parse table): a one-dimensional array
    # holding n + 1 state sets, where n is the number of input tokens.
    class Chart
      # The array of state sets, one per chart entry.
      attr_reader(:state_sets)

      # Build a chart with (tokenCount + 1) empty state sets and seed
      # entry 0 with a parse state made from the start dotted item.
      # @param startDottedItem the dotted item used to seed the chart
      # @param tokenCount number of tokens in the input
      def initialize(startDottedItem, tokenCount)
        entry_count = tokenCount + 1
        @state_sets = Array.new(entry_count) { StateSet.new }
        push_state(startDottedItem, 0, 0)
      end

      # The dotted item/rule that was used to seed the parse chart,
      # i.e. the dotted rule of the first state in the first state set.
      def start_dotted_rule
        seed_state = self[0].states.first
        seed_state.dotted_rule
      end

      # Access the state set at the given position.
      def [](index)
        state_sets[index]
      end

      # Create a parse state from the given dotted item and origin, then
      # push it onto the state set of the chart entry at index anIndex.
      def push_state(aDottedItem, anOrigin, anIndex)
        target_set = self[anIndex]
        target_set.push_state(ParseState.new(aDottedItem, anOrigin))
      end
    end # class
  end # module
end # module
38
+
39
+ # End of file
@@ -0,0 +1,80 @@
1
+ # A dotted item is a parse state for a given production/grammar rule
2
+ # It partitions the rhs of the rule in two parts.
3
+ # The left part consists of the symbols in the rules that are matched
4
+ # by the input tokens.
5
+ # The right part consists of symbols that are predicted to match the
6
+ # input tokens.
7
+ # The terminology stems from the traditional way to visualize the partition
8
+ # by using a fat dot character as a separator between the left and right parts
9
+ # An item with the dot at the beginning (i.e. before any rhs symbol)
10
+ # is called a predicted item.
11
+ # An item with the dot at the end (i.e. after all rhs symbols)
12
+ # is called a reduce item.
13
+ # An item with a dot in front of a terminal is called a shift item.
14
class DottedItem
  # Production rule
  attr_reader(:production)

  # Index of the next symbol (from the rhs) after the 'dot'.
  # If the dot is at the end of the rhs (i.e. there is no next
  # symbol), then the position takes the value -1.
  # If the rhs is empty, then the position is -2.
  attr_reader(:position)

  # @param aProduction the production rule; it must respond to
  #   #lhs and #rhs (the latter supporting #size and #[]).
  # @param aPosition [Integer] requested dot position, in the range
  #   0..rhs.size (rhs.size meaning "dot at the end").
  # @raise [StandardError] when aPosition lies outside that range.
  def initialize(aProduction, aPosition)
    @production = aProduction
    @position = valid_position(aPosition)
  end

  # Return true if the dot position is at the start of the rhs.
  # For an empty rhs (position -2) the dot is both at start and at end.
  def at_start?()
    return position == 0 || position == -2
  end

  # An item with the dot at the beginning is called
  # predicted item
  alias :predicted_item? :at_start?

  # A dotted item is called a reduce item if the dot is at the end.
  def reduce_item?()
    return position < 0 # Either -1 or -2
  end

  # The non-terminal symbol that is on the left-side of the production
  def lhs()
    return production.lhs
  end

  # Return the symbol after the dot.
  # nil is returned if the dot is at the end
  def next_symbol()
    return nil if position < 0

    return production.rhs[position]
  end

  # An item with the dot in front of a terminal is called a shift item.
  # Any next symbol that is not a non-terminal is considered a terminal
  # (the same convention is applied by the Earley parser when it chooses
  # between prediction and scanning).
  # Note: relies on Rley::Syntax::NonTerminal being loaded at call time.
  # @return [Boolean] true if a terminal immediately follows the dot;
  #   false if a non-terminal follows or the dot is at the end.
  def shift_item?()
    symbol = next_symbol
    return false if symbol.nil?

    return !symbol.kind_of?(Rley::Syntax::NonTerminal)
  end

  private

  # Return the given position after its validation/normalisation:
  #   -2 for an empty rhs, -1 when the dot is at the end of a
  #   non-empty rhs, otherwise the position itself.
  def valid_position(aPosition)
    rhs_size = production.rhs.size
    if aPosition < 0 || aPosition > rhs_size
      fail StandardError, 'Out of bound index'
    end

    return -2 if rhs_size == 0          # At start/end of empty production
    return -1 if aPosition == rhs_size  # At end of non-empty production

    return aPosition
  end
end # class
79
+
80
+ # End of file
@@ -0,0 +1,177 @@
1
+ require_relative '../syntax/grammar'
2
+ require_relative 'dotted_item'
3
+ require_relative 'parsing'
4
+
5
module Rley # This module is used as a namespace
  module Parser # This module is used as a namespace
    # Implementation of a parser that uses the Earley parsing algorithm.
    class EarleyParser
      # The grammar of the language.
      attr_reader(:grammar)

      # The dotted items/rules for the productions of the grammar
      attr_reader(:dotted_items)

      # A Hash that defines the mapping: non-terminal => [start dotted items]
      attr_reader(:start_mapping)

      # A Hash that defines the mapping: dotted item => next dotted item
      # In other words, the 'next_mapping' allows to find the dotted item
      # after "advancing" the dot
      attr_reader(:next_mapping)

      # @param aGrammar the grammar of the language to parse; it must
      #   respond to #rules, each rule responding to #lhs and #rhs.
      def initialize(aGrammar)
        @grammar = aGrammar
        @dotted_items = build_dotted_items(grammar)
        @start_mapping = build_start_mapping(dotted_items)
        @next_mapping = build_next_mapping(dotted_items)
      end

      # Parse a sequence of input tokens.
      # For each chart entry, every state is visited once and triggers
      # either a completion, a prediction or a scanning step.
      # @param aTokenSequence the input tokens (must respond to #size).
      # @return [Parsing] the object holding the chart built by the parse.
      def parse(aTokenSequence)
        result = Parsing.new(start_dotted_item, aTokenSequence)

        (0..aTokenSequence.size).each do |i|
          result.chart[i].each do |state|
            if state.complete?
              # parse reached end of production
              completion(result, state, i)
            else
              next_symbol = state.next_symbol
              if next_symbol.kind_of?(Syntax::NonTerminal)
                prediction(result, next_symbol, i)
              else
                # Expecting a terminal symbol
                scanning(result, next_symbol, i)
              end
            end
          end
        end

        return result
      end

      private

      # Create, for every production of the grammar, one dotted item per
      # possible dot position (0..rhs.size). A production with an empty
      # rhs yields exactly one dotted item (at position 0).
      # Items of a same production are contiguous and ordered by position.
      def build_dotted_items(aGrammar)
        return aGrammar.rules.flat_map do |prod|
          (0..prod.rhs.size).map { |i| DottedItem.new(prod, i) }
        end
      end

      # Create a Hash with pairs of the kind:
      # non-terminal => [start dotted items]
      def build_start_mapping(theDottedItems)
        return theDottedItems.select(&:at_start?).group_by(&:lhs)
      end

      # Create a Hash with pairs of the kind:
      # dotted item => next dotted item
      # next dotted item uses same production and the dot
      # position is advanced by one symbol.
      # Relies on the items of a production being consecutive.
      def build_next_mapping(theDottedItems)
        mapping = {}
        theDottedItems.each_cons(2) do |(item1, item2)|
          next if item1.production != item2.production
          mapping[item1] = item2
        end

        return mapping
      end

      # The dotted item for the start production and
      # with the dot at the beginning of the rhs
      def start_dotted_item()
        # TODO: remove assumption that first dotted_item is
        # for start production
        return dotted_items[0]
      end

      # This method is called when a parse state for chart entry at position
      # 'pos' expects as next symbol a non-terminal.
      # Given a predicted non-terminal 'nt' and a current token position
      # 'pos':
      # For each production with 'nt' as lhs, retrieve their corresponding
      # initial dotted rules nt -> . xxxx
      # For retrieved dotted rule, add a parse state to the chart entry at 'pos':
      # <initial dotted rule, pos, pos>
      # In short, one adds states to chart[pos], one per production that
      # specifies how to reduce some input into the predicted nt (non-terminal)
      # A prediction corresponds to a potential expansion of a nonterminal
      # in a left-most derivation.
      # @param aParsing [Parsing] the object that encapsulates the
      #   results of the parsing process
      # @param aNonTerminal [NonTerminal] a non-terminal symbol that
      #   immediately follows a dot
      #   (= is expected/predicted by the production rule)
      # @param aPosition [Fixnum] position in the input token sequence.
      def prediction(aParsing, aNonTerminal, aPosition)
        # Retrieve all start dotted items for productions
        # with aNonTerminal as its lhs
        items = start_mapping[aNonTerminal]
        items.each do |an_item|
          aParsing.push_state(an_item, aPosition, aPosition)
        end
      end

      # This method is called when a parse state for chart entry at position
      # 'pos' expects a terminal as next symbol.
      # If the input token matches the terminal symbol then:
      # Retrieve all parse states for chart entry at 'aPosition'
      # that have the given terminal as next symbol.
      # For each s of the above states, push to chart entry aPosition + 1
      # a new state like: <next dotted rule, s.origin, aPosition + 1>
      # In other words, we place the dotted rules in the next state set
      # such that the dot appears after terminal.
      # @param aParsing [Parsing] the object that encapsulates the
      #   results of the parsing process
      # @param aTerminal [Terminal] a terminal symbol that
      #   immediately follows a dot
      # @param aPosition [Fixnum] position in the input token sequence.
      def scanning(aParsing, aTerminal, aPosition)
        # The block tells the parsing how to advance the dot of an item
        aParsing.scanning(aTerminal, aPosition) { |item| next_mapping[item] }
      end

      # This method is called when a parse state at chart entry reaches the end
      # of a production.
      # For every state in chart[aPosition] that is complete (i.e. of the form:
      # { dotted_rule: X -> γ •, origin: j}),
      # Find states s in chart[j] of the form {dotted_rule: Y -> α • X β, origin: i}
      # In other words, rules that predicted the non-terminal X.
      # For each s, add to chart[aPosition] a state of the form
      # { dotted_rule: Y → α X • β, origin: i})
      def completion(aParsing, aState, aPosition)
        # The block tells the parsing how to advance the dot of an item
        aParsing.completion(aState, aPosition) { |item| next_mapping[item] }
      end
    end # class
  end # module
end # module
176
+
177
+ # End of file