odin 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. data/.gitignore +19 -0
  2. data/.rvmrc +1 -0
  3. data/.travis.yml +2 -0
  4. data/Gemfile +4 -0
  5. data/Gemfile.lock +26 -0
  6. data/HISTORY.md +102 -0
  7. data/LICENSE.md +10 -0
  8. data/README.md +46 -0
  9. data/Rakefile +69 -0
  10. data/app/controllers/grammar_checker.rb +51 -0
  11. data/check_grammar.rb +24 -0
  12. data/configure +9 -0
  13. data/images/atn_diagram.graffle +0 -0
  14. data/images/atn_diagram.pdf +0 -0
  15. data/images/odin-ff6.gif +0 -0
  16. data/lang/en/adjectives.rb +388 -0
  17. data/lang/en/atn.rb +102 -0
  18. data/lang/en/closed_class_words.rb +206 -0
  19. data/lang/en/data.rb +1086 -0
  20. data/lang/en/noun_inflections.rb +76 -0
  21. data/lang/en/noun_inflector_test_cases.rb +235 -0
  22. data/lang/en/pronoun_inflector_test_cases.rb +14 -0
  23. data/lang/en/verbs.rb +648 -0
  24. data/lang/iso639.rb +405 -0
  25. data/lib/array.rb +15 -0
  26. data/lib/atn.rb +82 -0
  27. data/lib/augmented_transition_network.rb +146 -0
  28. data/lib/dumper.rb +44 -0
  29. data/lib/noun_inflector.rb +283 -0
  30. data/lib/odin.rb +3 -0
  31. data/lib/odin/version.rb +3 -0
  32. data/lib/parts_of_speech.rb +402 -0
  33. data/lib/star.rb +23 -0
  34. data/lib/string.rb +99 -0
  35. data/lib/string_bracketing.rb +100 -0
  36. data/lib/word.rb +69 -0
  37. data/lib/word_net.rb +265 -0
  38. data/odin.gemspec +27 -0
  39. data/simple_atn/README.md +45 -0
  40. data/simple_atn/Rakefile +9 -0
  41. data/simple_atn/array.rb +15 -0
  42. data/simple_atn/augmented_transition_network.rb +146 -0
  43. data/simple_atn/augmented_transition_network_test.rb +113 -0
  44. data/simple_atn/english.rb +161 -0
  45. data/simple_atn/string.rb +63 -0
  46. data/test/fixtures/alice.txt +3594 -0
  47. data/test/fixtures/art.txt +7 -0
  48. data/test/fixtures/both.txt +1 -0
  49. data/test/fixtures/existing.txt +0 -0
  50. data/test/fixtures/existing.txt.checked.html +0 -0
  51. data/test/fixtures/grammar_checker.css +4 -0
  52. data/test/fixtures/grammatical.txt +1 -0
  53. data/test/fixtures/ungrammatical.txt +1 -0
  54. data/test/functional/grammar_checker_test.rb +64 -0
  55. data/test/integration/en/word_and_noun_inflector_test.rb +29 -0
  56. data/test/test_helper.rb +82 -0
  57. data/test/unit/atn_test.rb +240 -0
  58. data/test/unit/noun_inflector_test.rb +249 -0
  59. data/test/unit/pronoun_inflector_test.rb +17 -0
  60. data/test/unit/star_test.rb +24 -0
  61. data/test/unit/string_bracketing_test_module.rb +70 -0
  62. data/test/unit/string_test.rb +92 -0
  63. data/test/unit/word_test.rb +15 -0
  64. metadata +223 -0
@@ -0,0 +1,146 @@
1
+ require File.dirname(__FILE__) + '/string'
2
+ require File.dirname(__FILE__) + '/array'
3
+
4
# Raised while traversing the network when the input cannot be parsed from the
# current node/position.
#
# Inherits from StandardError (not Exception) so that ordinary `rescue` /
# `rescue => e` clauses can handle it; `rescue Ungrammatical` keeps working.
class Ungrammatical < StandardError; end
5
+
6
# A small augmented-transition-network (ATN) traverser.
#
# The network nodes themselves are ordinary methods taking
# (position, registers); they are not defined in this class. For the :en
# language they come from the English module, which is mixed into the instance
# on construction. Nodes move through the network with the private helpers
# below (follow_arc_to, jump_to, push) and publish their result with pop.
class AugmentedTransitionNetwork
  # language - grammar to load. Only :en is handled: it requires english.rb
  #            from this file's directory and extends this instance with
  #            English. Any other value loads no grammar.
  def initialize(language = :en)
    if language == :en
      # Deliberately lazy: the grammar file is only loaded when requested.
      require File.dirname(__FILE__) + '/english'
      extend English
    end
    clear!
  end

  # Traverse the network over +words+, starting at the node named +start_node+.
  #
  # words      - the input sequence; a frozen shallow copy is kept so node
  #              methods cannot modify the word list.
  # start_node - name of the node method invoked with position 0 and an empty
  #              register hash.
  #
  # Returns whatever the traversal last handed to pop. Exceptions raised by the
  # nodes (e.g. Ungrammatical) propagate to the caller.
  def parse(words, start_node = :sentence)
    clear!
    @words = words.dup.freeze
    send(start_node, 0, {})
    # The result for the network traversal is located in @star.
    @star
  end

  # Parse +words+ and return the double-quoted pieces of the result's inspect
  # string, stripped of their quotes and joined with single spaces.
  # Relies on String#matches_for, which is defined outside this class.
  def parse_to_string(words, start_node = :sentence)
    parsed = parse(words, start_node)
    parsed.inspect.matches_for(/".*?"/).join(" ").delete('"')
  end

  private

  # Reset the traversal state: no result and no words.
  def clear!
    @star = nil
    @words = []
  end

  # Tag a word or phrase with a functional role.
  #
  # For example, a single word may be labeled :noun.
  # A phrase (multiple words) may be labeled :noun_phrase. (Note that each
  # constituent of a phrase should have a tag as well.)
  #
  # marker       - the label placed first in the result.
  # constituents - a collection (anything responding to each) or a single
  #                value. A single value is treated as a one-element list so
  #                that a lone word (the default content in set_register) does
  #                not fail for lack of an each method.
  #
  # Returns an Array: [marker, *non-nil constituents].
  def tag(marker, constituents)
    # TODO Tag in a different way? I have to call .last to get the real word...
    parts = constituents.respond_to?(:each) ? constituents : [constituents]
    tagged = [marker]
    # An empty register (etc.) shows up as nil; keep it out of the tagging.
    parts.each { |constituent| tagged << constituent unless constituent.nil? }
    tagged
  end

  # TODO
  # def choose_arc(arcs, position, registers)
  #   arcs.each do |arc|
  #     begin
  #       arc.call(position, registers)
  #     rescue Ungrammatical
  #       # Move onto the next one
  #     end
  #   end
  #
  #   raise Ungrammatical
  # end

  # Set a given register in the hash given as an argument. The value that gets
  # assigned to the key is specified in the optional 'extras' hash. By default,
  # the tag is the same as the destination register (register_name) and the
  # content is the word at the given position.
  #
  # extras - optional Hash; :tag overrides the marker, :content overrides the
  #          tagged content.
  #
  # Returns the tagged value stored in registers[register_name].
  def set_register(register_name, position, registers, extras = {})
    marker  = extras[:tag] || register_name
    content = extras[:content] || @words[position]
    registers[register_name] = tag(marker, content)
  end

  # Raises Ungrammatical when a word is still present at +position+.
  # Otherwise returns whether +position+ equals the number of words.
  def at_last_word?(position)
    raise Ungrammatical unless @words[position].nil?

    @words.length == position
  end

  # Whether the word at +position+ answers truthily to the "<category>?"
  # predicate. Returns false when there is no word at that position.
  def in_category?(category, position)
    word = @words[position]
    !word.nil? && word.send("#{category}?")
  end

  # Whether the word at +position+ equals +exact_word+.
  # Raises Ungrammatical when +position+ is outside the word list.
  def exact_word?(exact_word, position)
    word = @words[position]
    raise Ungrammatical if word.nil?

    word == exact_word
  end

  # Consume the current word: continue at +node_name+ with the next position
  # and a copy of the registers.
  def follow_arc_to(node_name, position, registers)
    send(node_name, position + 1, registers.dup)
  end

  # Continue at +node_name+ without consuming a word (same position, copied
  # registers).
  def jump_to(node_name, position, registers)
    send(node_name, position, registers.dup)
  end

  # Traverse the subnetwork starting at +node_name+, store its result in a
  # register, then continue with another node.
  #
  # extras - Hash that must contain :into (destination register) and :next
  #          (node to continue with). Raises RuntimeError when either is nil.
  def push(node_name, position, registers, extras)
    if extras[:into].nil? || extras[:next].nil?
      raise "You must give :into and :next for the 'extra' hash"
    end

    destination_register = extras[:into]
    next_node = extras[:next]

    # Traverse the subnetwork...
    send(node_name, position, registers.dup)

    # The result for the subnetwork traversal is located in @star.
    registers[destination_register] = @star.dup
    # Advance past what the subnetwork produced. String#number_in_quotes is
    # defined outside this class; presumably it counts the quoted words in the
    # inspect string -- confirm against string.rb.
    position += registers[destination_register].inspect.number_in_quotes

    # Move along to the next node
    send(next_node, position, registers.dup)
  end

  # Publish +content+ as the result of the current (sub)network traversal.
  def pop(content)
    @star = content
  end
end
@@ -0,0 +1,44 @@
1
# Private helpers for printing headed/sectioned text and for rendering nested
# arrays as indented text. Mix into a class to use them.
module Dumper
  private

  # Print +string+ upcased between two rules of "=" (each five characters
  # longer than the string), yield to the caller's block, then print a closing
  # rule and a blank line.
  def heading(string)
    rule = "=" * (string.length + 5)

    puts rule
    puts string.upcase
    puts rule
    yield
    puts rule
    puts
  end

  # Print +string+ underlined with "-" (two characters longer than the
  # string), yield to the caller's block, then print a blank line.
  def section(string)
    puts string
    puts "-" * (string.length + 2)
    yield
    puts
  end

  # Prefix every line of +string+ with +character+ repeated +length+ times.
  #
  # Returns a new String; line endings are kept as they are, and an empty
  # input gives an empty result.
  def indent(string, length = 2, character = " ")
    prefix = character * length
    string.each_line.map { |line| prefix + line }.join
  end

  # Render +tree+ (nested arrays) as text, one element's inspect per line.
  # Every nested Array -- including instances of Array subclasses -- is
  # rendered recursively and indented one level deeper than its parent.
  #
  # Returns a String.
  def inspect_tree(tree)
    tree.map do |branch|
      if branch.is_a?(Array)
        indent(inspect_tree(branch))
      else
        "#{branch.inspect}\n"
      end
    end.join
  end
end
@@ -0,0 +1,283 @@
1
+ # From Rails
2
+ require 'singleton'
3
+
4
# The NounInflector transforms words from singular to plural, class names to
# table names, modularized class names to ones without, and class names to
# foreign keys. The module itself ships no rules: pluralization,
# singularization and uncountable-word rules are registered through
# NounInflector.inflections (this file loads lang/en/noun_inflections at the
# bottom for that purpose).
#
# Every method is available both as a mixin instance method and directly on
# the module (NounInflector.pluralize("post")), because the module extends
# itself.
module NounInflector
  # A singleton instance of this class is yielded by NounInflector.inflections,
  # which can then be used to specify additional inflection rules. Examples:
  #
  #   NounInflector.inflections do |inflect|
  #     inflect.plural /^(ox)$/i, '\1\2en'
  #     inflect.singular /^(ox)en/i, '\1'
  #
  #     inflect.irregular 'octopus', 'octopi'
  #
  #     inflect.uncountable "equipment"
  #   end
  #
  # New rules are added at the top. So in the example above, the irregular rule
  # for octopus will now be the first of the pluralization and singularization
  # rules that is run. This guarantees that your rules run before any of the
  # rules that may already have been loaded.
  class Inflections
    include Singleton

    attr_reader :plurals, :singulars, :uncountables

    def initialize
      @plurals, @singulars, @uncountables = [], [], []
    end

    # Specifies a new pluralization rule and its replacement. The rule can
    # either be a string or a regular expression. The replacement should always
    # be a string that may include references to the matched data from the rule.
    def plural(rule, replacement)
      @plurals.insert(0, [rule, replacement])
    end

    # Specifies a new singularization rule and its replacement. The rule can
    # either be a string or a regular expression. The replacement should always
    # be a string that may include references to the matched data from the rule.
    def singular(rule, replacement)
      @singulars.insert(0, [rule, replacement])
    end

    # Specifies a new irregular that applies to both pluralization and
    # singularization at the same time. This can only be used for strings, not
    # regular expressions. You simply pass the irregular in singular and plural
    # form.
    #
    # Examples:
    #   irregular 'octopus', 'octopi'
    #   irregular 'person', 'people'
    def irregular(singular, plural)
      # NOTE: the parameters share their names with the rule-registering
      # methods; `plural(...)` / `singular(...)` with arguments are method
      # calls, the bare names are the string arguments.
      if singular[0,1].upcase == plural[0,1].upcase
        # Same initial letter: capture it so the input's capitalisation is kept.
        plural(Regexp.new("(#{singular[0,1]})#{singular[1..-1]}$", "i"), '\1' + plural[1..-1])
        singular(Regexp.new("(#{plural[0,1]})#{plural[1..-1]}$", "i"), '\1' + singular[1..-1])
      else
        # Different initial letters: register one rule per case of the first
        # letter; the remainder is matched case-insensitively via (?i).
        plural(Regexp.new("#{singular[0,1].upcase}(?i)#{singular[1..-1]}$"), plural[0,1].upcase + plural[1..-1])
        plural(Regexp.new("#{singular[0,1].downcase}(?i)#{singular[1..-1]}$"), plural[0,1].downcase + plural[1..-1])
        singular(Regexp.new("#{plural[0,1].upcase}(?i)#{plural[1..-1]}$"), singular[0,1].upcase + singular[1..-1])
        singular(Regexp.new("#{plural[0,1].downcase}(?i)#{plural[1..-1]}$"), singular[0,1].downcase + singular[1..-1])
      end
    end

    # Add uncountable words that shouldn't be inflected.
    #
    # Examples:
    #   uncountable "money"
    #   uncountable "money", "information"
    #   uncountable %w( money information rice )
    def uncountable(*words)
      (@uncountables << words).flatten!
    end

    # Clears the loaded inflections within a given scope (default is :all).
    # Give the scope as a symbol of the inflection type, the options are:
    # :plurals, :singulars, :uncountables
    #
    # Examples:
    #   clear :all
    #   clear :plurals
    def clear(scope = :all)
      case scope
      when :all
        @plurals, @singulars, @uncountables = [], [], []
      else
        instance_variable_set "@#{scope}", []
      end
    end
  end

  extend self

  # Returns the shared Inflections instance, or yields it when a block is
  # given (in which case the block's value is returned).
  def inflections
    if block_given?
      yield Inflections.instance
    else
      Inflections.instance
    end
  end

  # Returns the plural form of the word. Accepts anything that responds to
  # to_s. Empty and uncountable words are returned unchanged; otherwise the
  # first registered plural rule that matches is applied.
  #
  # Examples (results depend on the registered rules)
  #   pluralize("post")             #=> "posts"
  #   pluralize("octopus")          #=> "octopi"
  #   pluralize("sheep")            #=> "sheep"
  #   pluralize("words")            #=> "words"
  #   pluralize("the blue mailman") #=> "the blue mailmen"
  #   pluralize("CamelOctopus")     #=> "CamelOctopi"
  def pluralize(word)
    result = word.to_s.dup

    # Test the converted string, not the raw argument: non-String input
    # (e.g. an Integer) does not respond to empty?.
    return result if result.empty? || inflections.uncountables.include?(result.downcase)

    inflections.plurals.each { |(rule, replacement)| break if result.gsub!(rule, replacement) }
    result
  end

  # The reverse of pluralize, returns the singular form of a word.
  #
  # Examples (results depend on the registered rules)
  #   singularize("posts")            #=> "post"
  #   singularize("octopi")           #=> "octopus"
  #   singularize("sheep")            #=> "sheep"
  #   singularize("word")             #=> "word"
  #   singularize("the blue mailmen") #=> "the blue mailman"
  #   singularize("CamelOctopi")      #=> "CamelOctopus"
  def singularize(word)
    result = word.to_s.dup

    return result if inflections.uncountables.include?(result.downcase)

    inflections.singulars.each { |(rule, replacement)| break if result.gsub!(rule, replacement) }
    result
  end

  # By default, camelize converts strings to UpperCamelCase. If the second
  # argument is false (or nil) or the symbol :lower, camelize produces
  # lowerCamelCase: the first character of the input is kept as given.
  #
  # camelize will also convert '/' to '::' which is useful for converting
  # paths to namespaces.
  #
  # Examples
  #   camelize("active_record")                #=> "ActiveRecord"
  #   camelize("active_record", :lower)        #=> "activeRecord"
  #   camelize("active_record/errors")         #=> "ActiveRecord::Errors"
  #   camelize("active_record/errors", :lower) #=> "activeRecord::Errors"
  def camelize(lower_case_and_underscored_word, first_letter_in_uppercase = true)
    word = lower_case_and_underscored_word.to_s
    upper_camel = word.gsub(/\/(.?)/) { "::" + Regexp.last_match(1).upcase }.
                       gsub(/(^|_)(.)/) { Regexp.last_match(2).upcase }

    # :lower is truthy, so it has to be checked explicitly to honour the
    # documented camelize(word, :lower) form.
    return upper_camel if first_letter_in_uppercase && first_letter_in_uppercase != :lower

    # Core String has no #first; slice instead. to_s covers the empty string,
    # for which [1..-1] is nil.
    word[0, 1] + upper_camel[1..-1].to_s
  end

  # Capitalizes all the words and replaces some characters in the string to
  # create a nicer looking title. Titleize is meant for creating pretty output.
  # (No titlecase alias is defined in this module.)
  #
  # Examples
  #   titleize("man from the boondocks") #=> "Man From The Boondocks"
  #   titleize("x-men: the last stand")  #=> "X Men: The Last Stand"
  def titleize(word)
    humanize(underscore(word)).gsub(/\b([a-z])/) { Regexp.last_match(1).capitalize }
  end

  # The reverse of +camelize+. Makes an underscored form from the expression
  # in the string.
  #
  # Changes '::' to '/' to convert namespaces to paths.
  #
  # Examples
  #   underscore("ActiveRecord")         #=> "active_record"
  #   underscore("ActiveRecord::Errors") #=> "active_record/errors"
  def underscore(camel_cased_word)
    camel_cased_word.to_s.gsub(/::/, '/').
      gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2').
      gsub(/([a-z\d])([A-Z])/, '\1_\2').
      tr("-", "_").
      downcase
  end

  # Replaces underscores with dashes in the string. Like the other helpers,
  # accepts anything that responds to to_s.
  #
  # Example
  #   dasherize("puni_puni") #=> "puni-puni"
  def dasherize(underscored_word)
    underscored_word.to_s.tr("_", "-")
  end

  # Capitalizes the first word and turns underscores into spaces and strips a
  # trailing _id. Like titleize, this is meant for creating pretty output.
  #
  # Examples
  #   humanize("employee_salary") #=> "Employee salary"
  #   humanize("author_id")       #=> "Author"
  def humanize(lower_case_and_underscored_word)
    lower_case_and_underscored_word.to_s.gsub(/_id$/, "").gsub(/_/, " ").capitalize
  end

  # Removes the module part from the expression in the string.
  #
  # Examples
  #   demodulize("ActiveRecord::CoreExtensions::String::Inflections") #=> "Inflections"
  #   demodulize("Inflections")                                       #=> "Inflections"
  def demodulize(class_name_in_module)
    class_name_in_module.to_s.gsub(/^.*::/, '')
  end

  # Create the name of a table like Rails does for models to table names.
  # The whole underscored name is passed to pluralize.
  #
  # Examples (results depend on the registered rules)
  #   tableize("RawScaledScorer") #=> "raw_scaled_scorers"
  #   tableize("egg_and_ham")     #=> "egg_and_hams"
  #   tableize("fancyCategory")   #=> "fancy_categories"
  def tableize(class_name)
    pluralize(underscore(class_name))
  end

  # Create a class name from a table name like Rails does for table names to
  # models. Note that this returns a string and not a Class. (To convert to an
  # actual class follow classify with constantize.)
  #
  # Examples (results depend on the registered rules)
  #   classify("egg_and_hams") #=> "EggAndHam"
  #   classify("post")         #=> "Post"
  def classify(table_name)
    # strip out any leading schema name
    camelize(singularize(table_name.to_s.sub(/.*\./, '')))
  end

  # Creates a foreign key name from a class name.
  # +separate_class_name_and_id_with_underscore+ sets whether
  # the method should put '_' between the name and 'id'.
  #
  # Examples
  #   foreign_key("Message")        #=> "message_id"
  #   foreign_key("Message", false) #=> "messageid"
  #   foreign_key("Admin::Post")    #=> "post_id"
  def foreign_key(class_name, separate_class_name_and_id_with_underscore = true)
    underscore(demodulize(class_name)) + (separate_class_name_and_id_with_underscore ? "_id" : "id")
  end

  # Constantize tries to find a declared constant with the name specified
  # in the string. It raises a NameError when the name is not in CamelCase
  # or is not initialized.
  #
  # Examples
  #   constantize("Module") #=> Module
  #   constantize("Class")  #=> Class
  def constantize(camel_cased_word)
    unless /\A(?:::)?([A-Z]\w*(?:::[A-Z]\w*)*)\z/ =~ camel_cased_word
      raise NameError, "#{camel_cased_word.inspect} is not a valid constant name!"
    end

    # Only text that passed the constant-path pattern above reaches the eval.
    Object.module_eval("::#{Regexp.last_match(1)}", __FILE__, __LINE__)
  end

  # Ordinalize turns a number into an ordinal string used to denote the
  # position in an ordered sequence such as 1st, 2nd, 3rd, 4th.
  #
  # Examples
  #   ordinalize(1)    # => "1st"
  #   ordinalize(2)    # => "2nd"
  #   ordinalize(1002) # => "1002nd"
  #   ordinalize(1003) # => "1003rd"
  def ordinalize(number)
    # 11, 12 and 13 (and 111, 112, ...) take "th" despite ending in 1, 2, 3.
    return "#{number}th" if (11..13).include?(number.to_i % 100)

    case number.to_i % 10
    when 1 then "#{number}st"
    when 2 then "#{number}nd"
    when 3 then "#{number}rd"
    else "#{number}th"
    end
  end
end
282
+
283
+ require File.dirname(__FILE__) + '/../lang/en/noun_inflections'