rley 0.0.02

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +15 -0
  2. data/.rspec +1 -0
  3. data/.rubocop.yml +74 -0
  4. data/.ruby-gemset +1 -0
  5. data/.ruby-version +1 -0
  6. data/.simplecov +7 -0
  7. data/.travis.yml +21 -0
  8. data/.yardopts +6 -0
  9. data/CHANGELOG.md +10 -0
  10. data/Gemfile +8 -0
  11. data/LICENSE.txt +19 -0
  12. data/README.md +19 -0
  13. data/Rakefile +32 -0
  14. data/lib/rley/constants.rb +26 -0
  15. data/lib/rley/parser/chart.rb +39 -0
  16. data/lib/rley/parser/dotted_item.rb +80 -0
  17. data/lib/rley/parser/earley_parser.rb +177 -0
  18. data/lib/rley/parser/parse_state.rb +54 -0
  19. data/lib/rley/parser/parsing.rb +101 -0
  20. data/lib/rley/parser/state_set.rb +47 -0
  21. data/lib/rley/parser/token.rb +21 -0
  22. data/lib/rley/syntax/grammar.rb +59 -0
  23. data/lib/rley/syntax/grm_symbol.rb +18 -0
  24. data/lib/rley/syntax/literal.rb +20 -0
  25. data/lib/rley/syntax/non_terminal.rb +18 -0
  26. data/lib/rley/syntax/production.rb +42 -0
  27. data/lib/rley/syntax/symbol_seq.rb +36 -0
  28. data/lib/rley/syntax/terminal.rb +18 -0
  29. data/lib/rley/syntax/verbatim_symbol.rb +21 -0
  30. data/spec/rley/parser/chart_spec.rb +47 -0
  31. data/spec/rley/parser/dotted_item_spec.rb +108 -0
  32. data/spec/rley/parser/earley_parser_spec.rb +271 -0
  33. data/spec/rley/parser/parse_state_spec.rb +99 -0
  34. data/spec/rley/parser/parsing_spec.rb +118 -0
  35. data/spec/rley/parser/state_set_spec.rb +68 -0
  36. data/spec/rley/parser/token_spec.rb +40 -0
  37. data/spec/rley/syntax/grammar_spec.rb +149 -0
  38. data/spec/rley/syntax/grm_symbol_spec.rb +29 -0
  39. data/spec/rley/syntax/literal_spec.rb +32 -0
  40. data/spec/rley/syntax/non_terminal_spec.rb +29 -0
  41. data/spec/rley/syntax/production_spec.rb +50 -0
  42. data/spec/rley/syntax/symbol_seq_spec.rb +65 -0
  43. data/spec/rley/syntax/terminal_spec.rb +29 -0
  44. data/spec/rley/syntax/verbatim_symbol_spec.rb +32 -0
  45. data/spec/spec_helper.rb +21 -0
  46. metadata +166 -0
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ZTgxOWU0YmIzMDdlZmQ3NGVlZDBkYzcxZTEzNDQ4NDgwMWM3ZmZiOA==
5
+ data.tar.gz: !binary |-
6
+ ZmZkNWZlZDgwZWQ2ZTYzYjA5ZjkyNTZlZjMwMGZmMjIwNjVjODFjNQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ ZGY2YzBlMTM0MTNiZWE3ZjQyYmNjOWMzYWQ4ZTY4N2RjMDQ4YzExMTg5MTM5
10
+ NjFlMDRlZjYyYWM5NDJmYzlhNzY3YWE3N2FiZDVhNGM2NzVhOGMwZjZmZjE0
11
+ YjhkZjFiNGVlOTQwMmZjZjkzNWQ3ZGY3NGM1Y2M4YWU3ZjE3MDI=
12
+ data.tar.gz: !binary |-
13
+ MDViNDQ3MjBjOTg1MWI2NmJmNmRhZTg2MzQ0MmRlMDZmY2JmMDhiNTZlY2Zi
14
+ NTMwZDdlNGI1MWIwMzkxN2FiNjMyZjk4ZWViZjk0YzJlMTY0MmMyZmVlN2U3
15
+ MGJiYjFlNDE5NzM0MzhlMWQzNGIyMDBmOTJkZDQwNDYyNDVjNjQ=
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --backtrace
data/.rubocop.yml ADDED
@@ -0,0 +1,74 @@
1
+ AllCops:
2
+ Exclude:
3
+ - 'examples/**/*'
4
+ - 'features/**/*'
5
+ - 'gems/**/*'
6
+
7
+ # This is disabled because some demos use UTF-8
8
+ AsciiComments:
9
+ Enabled: false
10
+
11
+ CaseIndentation:
12
+ IndentWhenRelativeTo: end
13
+ IndentOneStep: true
14
+
15
+ # Rubocop enforces the use of is_a? instead of kind_of?
16
+ # Which is contrary to modelling practice.
17
+ ClassCheck:
18
+ Enabled: false
19
+
20
+ ClassLength:
21
+ Max: 250
22
+ CountComments: false
23
+
24
+ ConstantName:
25
+ Enabled: false
26
+
27
+ CyclomaticComplexity:
28
+ Enabled: false
29
+
30
+ DefWithParentheses:
31
+ Enabled: false
32
+
33
+ Documentation:
34
+ Enabled: false
35
+
36
+ EmptyLines:
37
+ Enabled: false
38
+
39
+ EmptyLinesAroundBody:
40
+ Enabled: false
41
+
42
+ Encoding:
43
+ Enabled: false
44
+
45
+ FileName:
46
+ Enabled: false
47
+
48
+ IndentationWidth :
49
+ Enabled: false
50
+
51
+
52
+ # Avoid methods longer than 50 lines of code
53
+ MethodLength:
54
+ Max: 50
55
+ CountComments: false
56
+
57
+ NonNilCheck:
58
+ Enabled: false
59
+
60
+ NumericLiterals:
61
+ Enabled: false
62
+
63
+ RaiseArgs:
64
+ Enabled: false
65
+
66
+ RedundantReturn:
67
+ Enabled: false
68
+
69
+ SpaceInsideBrackets:
70
+ Enabled: false
71
+
72
+ TrailingWhitespace:
73
+ Enabled: false
74
+
data/.ruby-gemset ADDED
@@ -0,0 +1 @@
1
+ rley
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 1.9.3
data/.simplecov ADDED
@@ -0,0 +1,7 @@
1
+ # .simplecov
2
+ # Configuration
3
+
4
+ SimpleCov.start do
5
+ # Remove all files that match /spec/ in their path
6
+ add_filter "/spec/"
7
+ end
data/.travis.yml ADDED
@@ -0,0 +1,21 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.1.0
4
+ - 2.0.0
5
+ - 1.9.3
6
+ - 1.9.2
7
+ - jruby-19mode
8
+ - jruby-head
9
+
10
+ # Workaround issue of jruby-head configuration on Travis CI
11
+ matrix:
12
+ allow_failures:
13
+ - rvm: jruby-head
14
+
15
+ gemfile:
16
+ - Gemfile
17
+
18
+ # whitelist
19
+ branches:
20
+ only:
21
+ - master
data/.yardopts ADDED
@@ -0,0 +1,6 @@
1
+ --exclude examples --exclude features --exclude spec
2
+ --no-private
3
+ --markup markdown
4
+ -
5
+ Changelog.md
6
+ License.txt
data/CHANGELOG.md ADDED
@@ -0,0 +1,10 @@
1
+ ### 0.0.02 / 2014-11-12
2
+ * [CHANGE] File `README.md`: Added Travis CI badge.
3
+
4
+
5
+ ### 0.0.01 / 2014-11-12
6
+ * [CHANGE] Rley is "gemmified"!
7
+
8
+
9
+ ### 0.0.00 / 2014-11-07
10
+ * [FEATURE] Initial public working version
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source 'https://rubygems.org'
2
+ # Prevent Bundler from loading the dependencies from our .gemspec file
3
+
4
+ group :development do
5
+ gem 'rake', '>= 0.8.0'
6
+ gem 'rspec', '>= 3.0.0'
7
+ gem 'simplecov', '>= 0.5.0'
8
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2014 Dimitri Geshef
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,19 @@
1
+ Rley
2
+ ===========
3
+ [Homepage](https://github.com/famished-tiger/Rley)
4
+
5
+
6
+ [![Build Status](https://travis-ci.org/famished-tiger/Rley.svg?branch=master)](https://travis-ci.org/famished-tiger/Rley)
7
+
8
+ ### What is Rley? ###
9
+ __Rley__ is a Ruby implementation of an Earley parser.
10
+ The objective is to build a parser convenient for lightweight NLP (Natural Language Processing) purposes.
11
+
12
+ This project is in "early" stage.
13
+ Consult Wikipedia to learn more about Earley's parsing algorithm.
14
+
15
+
16
+ Copyright
17
+ ---------
18
+ Copyright (c) 2014, Dimitri Geshef.
19
+ __Rley__ is released under the MIT License; see [LICENSE.txt](https://github.com/famished-tiger/Rley/blob/master/LICENSE.txt) for details.
data/Rakefile ADDED
@@ -0,0 +1,32 @@
1
+ require 'rubygems'
2
+ require_relative './lib/rley/constants'
3
+
4
# Tasks related to the packaging/publication of the gem.
namespace :gem do
  desc 'Push the gem to rubygems.org'
  task(:push) { system("gem push rley-#{Rley::Version}.gem") }
end # namespace


# Testing-specific tasks

# RSpec is the testing tool; its Rake integration defines the :spec task.
require 'rspec/core/rake_task'
desc 'Run RSpec'
RSpec::Core::RakeTask.new { |spec| spec.pattern = 'spec/**/*_spec.rb' }


# Umbrella task that groups the test suites (currently RSpec only).
desc 'Run tests, with RSpec'
task :test => [:spec]


# Running plain `rake` executes the tests.
task :default => :test
31
+
32
+ # End of file
@@ -0,0 +1,26 @@
1
+ # File: constants.rb
2
+ # Purpose: definition of Rley constants.
3
+
4
module Rley # Module used as a namespace
  # The version number of the gem.
  Version = '0.0.02'

  # Brief description of the gem.
  Description = "Ruby implementation of the Earley's parsing algorithm"

  # Constant Rley::RootDir holds the absolute path of Rley's start
  # directory (two levels above the directory of this file).
  # Note: the path always ends with a slash character.
  # The assignment is guarded because re-assigning a constant is not
  # allowed; this file may be loaded more than once.
  unless defined?(RootDir)
    require 'pathname' # Load Pathname class from standard library
    startdir = Pathname.new(__FILE__).dirname.parent.parent
    RootDir = "#{startdir.expand_path}/" # Trailing slash is part of the contract
  end
end # module
25
+
26
+ # End of file
@@ -0,0 +1,39 @@
1
+ require_relative 'state_set'
2
+ require_relative 'parse_state'
3
+
4
+ module Rley # This module is used as a namespace
5
+ module Parser # This module is used as a namespace
6
# Also called a parse table.
# A one-dimensional array with n + 1 entries (n = number of input tokens).
class Chart
  # The array of state sets, one per chart entry.
  attr_reader :state_sets

  # Create the chart entries and seed the first one.
  # @param startDottedItem the dotted item pushed (with origin 0)
  #   into the chart entry at index 0.
  # @param tokenCount [Integer] number of input tokens; the chart gets
  #   tokenCount + 1 entries.
  def initialize(startDottedItem, tokenCount)
    entry_count = tokenCount + 1
    @state_sets = Array.new(entry_count) { StateSet.new }
    push_state(startDottedItem, 0, 0)
  end

  # The dotted item/rule used to seed the parse chart.
  # It corresponds to the start production and a dot placed
  # at the beginning of the rhs.
  def start_dotted_rule
    seed_state = self[0].states.first
    seed_state.dotted_rule
  end

  # Access the state set at given position.
  def [](index)
    state_sets[index]
  end

  # Push a parse state for the chart entry with given index.
  # A new ParseState is built from the dotted item and the origin,
  # then handed over to the state set found at anIndex.
  def push_state(aDottedItem, anOrigin, anIndex)
    fresh_state = ParseState.new(aDottedItem, anOrigin)
    target_set = self[anIndex]
    target_set.push_state(fresh_state)
  end
end # class
35
+
36
+ end # module
37
+ end # module
38
+
39
+ # End of file
@@ -0,0 +1,80 @@
1
# A dotted item is a parse state for a given production/grammar rule.
# It partitions the rhs of the rule in two parts.
# The left part consists of the symbols in the rule that are matched
# by the input tokens.
# The right part consists of symbols that are predicted to match the
# input tokens.
# The terminology stems from the traditional way to visualize the partition
# by using a fat dot character as a separator between the left and right parts.
# An item with the dot at the beginning (i.e. before any rhs symbol)
# is called a predicted item.
# An item with the dot at the end (i.e. after all rhs symbols)
# is called a reduce item.
# An item with a dot in front of a terminal is called a shift item.
class DottedItem
  # Production rule
  attr_reader(:production)

  # Index of the next symbol (from the rhs) after the 'dot'.
  # If the dot is at the end of the rhs (i.e. there is no next
  # symbol), then the position takes the value -1.
  # If the rhs is empty, then the position is -2.
  attr_reader(:position)

  # @param aProduction the production rule; must respond to #rhs
  #   (a sized, indexable sequence) and #lhs.
  # @param aPosition [Integer] index of the dot in the rhs, in the
  #   range 0..rhs.size.
  # @raise [StandardError] when aPosition lies outside that range.
  def initialize(aProduction, aPosition)
    # The production must be set first: valid_position reads it.
    @production = aProduction
    @position = valid_position(aPosition)
  end

  # Return true if the dot position is at the start of the rhs.
  # An empty rhs (position -2) counts as being at the start.
  def at_start?()
    return position == 0 || position == -2
  end

  # An item with the dot at the beginning is called
  # predicted item
  alias :predicted_item? :at_start?

  # A dotted item is called a reduce item if the dot is at the end.
  def reduce_item?()
    return position < 0 # Either -1 or -2
  end

  # The non-terminal symbol that is on the left-side of the production
  def lhs()
    return production.lhs
  end

  # Return the symbol after the dot.
  # nil is returned if the dot is at the end
  def next_symbol()
    return reduce_item? ? nil : production.rhs[position]
  end

  # An item with the dot in front of a terminal is called a shift item.
  # TODO: not implemented yet; the body is empty, so this predicate
  # currently always returns nil (i.e. a falsy value).
  def shift_item?()
  end

  private

  # Return the given position after its validation and conversion
  # into the internal encoding (-2: empty rhs, -1: dot at end of a
  # non-empty rhs, otherwise the position itself).
  # @raise [StandardError] when aPosition is outside 0..rhs.size.
  def valid_position(aPosition)
    rhs_size = production.rhs.size
    if aPosition < 0 || aPosition > rhs_size
      fail StandardError, 'Out of bound index'
    end

    return -2 if rhs_size == 0         # Start/end of empty production
    return -1 if aPosition == rhs_size # End of non-empty production

    return aPosition
  end
end # class
79
+
80
+ # End of file
@@ -0,0 +1,177 @@
1
+ require_relative '../syntax/grammar'
2
+ require_relative 'dotted_item'
3
+ require_relative 'parsing'
4
+
5
+ module Rley # This module is used as a namespace
6
+ module Parser # This module is used as a namespace
7
+
8
# Implementation of a parser that uses the Earley parsing algorithm.
class EarleyParser
  # The grammar of the language.
  attr_reader(:grammar)

  # The dotted items/rules for the productions of the grammar
  attr_reader(:dotted_items)

  # A Hash that defines the mapping: non-terminal => [start dotted items]
  attr_reader(:start_mapping)

  # A Hash that defines the mapping: dotted item => next dotted item
  # In other words, the 'next_mapping' allows to find the dotted item
  # after "advancing" the dot
  attr_reader(:next_mapping)


  # @param aGrammar the grammar of the language; must respond to #rules
  #   (the collection of its productions).
  def initialize(aGrammar)
    @grammar = aGrammar
    @dotted_items = build_dotted_items(grammar)
    @start_mapping = build_start_mapping(dotted_items)
    @next_mapping = build_next_mapping(dotted_items)
  end

  # Parse a sequence of input tokens.
  # @param aTokenSequence the input tokens; must respond to #size.
  # @return [Parsing] the object holding the chart filled by the parse.
  def parse(aTokenSequence)
    result = Parsing.new(start_dotted_item, aTokenSequence)

    (0..aTokenSequence.size).each do |i|
      result.chart[i].each do |state|
        if state.complete?
          # parse reached end of production
          completion(result, state, i)
        else
          next_symbol = state.next_symbol
          if next_symbol.kind_of?(Syntax::NonTerminal)
            prediction(result, next_symbol, i)
          else
            # Expecting a terminal symbol
            scanning(result, next_symbol, i)
          end
        end
      end
    end

    return result
  end

  private

  # Create, for every production of the grammar, one dotted item per
  # possible dot position (0 up to and including the rhs size).
  # A production with an empty rhs yields a single item (dot at 0).
  # Items of a same production are contiguous and sorted by dot
  # position; build_next_mapping relies on that ordering.
  def build_dotted_items(aGrammar)
    return aGrammar.rules.flat_map do |prod|
      (0..prod.rhs.size).map { |i| DottedItem.new(prod, i) }
    end
  end

  # Create a Hash with pairs of the kind:
  # non-terminal => [start dotted items]
  def build_start_mapping(theDottedItems)
    mapping = {}
    theDottedItems.each do |item|
      next unless item.at_start?

      (mapping[item.lhs] ||= []) << item
    end

    return mapping
  end

  # Create a Hash with pairs of the kind:
  # dotted item => next dotted item
  # next dotted item uses same production and the dot
  # position is advanced by one symbol
  def build_next_mapping(theDottedItems)
    mapping = {}
    theDottedItems.each_cons(2) do |(item1, item2)|
      # Consecutive items of different productions are unrelated
      next if item1.production != item2.production
      mapping[item1] = item2
    end

    return mapping
  end

  # The dotted item for the start production and
  # with the dot at the beginning of the rhs
  def start_dotted_item()
    # TODO: remove assumption that first dotted_item is
    # for start production
    return dotted_items[0]
  end


  # This method is called when a parse state for chart entry at position
  # 'pos' expects as next symbol a non-terminal.
  # Given a predicted non-terminal 'nt' and a current token position
  # 'pos':
  # For each production with 'nt' as lhs, retrieve their corresponding
  # initial dotted rules nt -> . xxxx
  # For retrieved dotted rule, add a parse state to the chart entry at 'pos':
  # <initial dotted rule, pos, pos>
  # In short, one adds states to chart[pos], one per production that
  # specifies how to reduce some input into the predicted nt (non-terminal)
  # A prediction corresponds to a potential expansion of a nonterminal
  # in a left-most derivation.
  # @param aParsing [Parsing] the object that encapsulates the
  #   result of the parsing process
  # @param aNonTerminal [NonTerminal] a non-terminal symbol that
  #   immediately follows a dot
  #   (= is expected/predicted by the production rule)
  # @param aPosition [Fixnum] position in the input token sequence.
  def prediction(aParsing, aNonTerminal, aPosition)
    # Retrieve all start dotted items for productions
    # with aNonTerminal as its lhs.
    # A non-terminal without any production predicts nothing
    # (instead of failing on a nil entry).
    items = start_mapping.fetch(aNonTerminal, [])
    items.each do |an_item|
      aParsing.push_state(an_item, aPosition, aPosition)
    end
  end

  # This method is called when a parse state for chart entry at position
  # 'pos' expects a terminal as next symbol.
  # If the input token matches the terminal symbol then:
  # Retrieve all parse states for chart entry at 'aPosition'
  # that have the given terminal as next symbol.
  # For each s of the above states, push to chart entry aPosition + 1
  # a new state like: <next dotted rule, s.origin, aPosition + 1>
  # In other words, we place the dotted rules in the next state set
  # such that the dot appears after terminal.
  # @param aParsing [Parsing] the object that encapsulates the
  #   result of the parsing process
  # @param aTerminal [Terminal] a terminal symbol that
  #   immediately follows a dot
  # @param aPosition [Fixnum] position in the input token sequence.
  def scanning(aParsing, aTerminal, aPosition)
    aParsing.scanning(aTerminal, aPosition) { |item|
      next_mapping[item]
    }
  end


  # This method is called when a parse state at chart entry reaches the end
  # of a production.
  # For every state in chart[aPosition] that is complete (i.e. of the form:
  # { dotted_rule: X -> γ •, origin: j}),
  # Find states s in chart[j] of the form {dotted_rule: Y -> α • X β, origin: i}
  # In other words, rules that predicted the non-terminal X.
  # For each s, add to chart[aPosition] a state of the form
  # { dotted_rule: Y → α X • β, origin: i})
  def completion(aParsing, aState, aPosition)
    aParsing.completion(aState, aPosition) { |item|
      next_mapping[item]
    }
  end

end # class
173
+
174
+ end # module
175
+ end # module
176
+
177
+ # End of file