pragmatic_tokenizer 1.4.0 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +184 -0
  3. data/.rubocop_todo.yml +66 -0
  4. data/README.md +0 -7
  5. data/Rakefile +1 -1
  6. data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
  7. data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
  8. data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
  9. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
  10. data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
  11. data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
  12. data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
  13. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
  15. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  16. data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
  17. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
  19. data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
  20. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  22. data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
  23. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  25. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  26. data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
  27. data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
  28. data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
  29. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  30. data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
  31. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  32. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  33. data/lib/pragmatic_tokenizer/languages.rb +28 -28
  34. data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
  35. data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
  36. data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
  37. data/lib/pragmatic_tokenizer/version.rb +1 -1
  38. data/pragmatic_tokenizer.gemspec +1 -0
  39. data/spec/languages/bulgarian_spec.rb +17 -13
  40. data/spec/languages/deutsch_spec.rb +110 -86
  41. data/spec/languages/english_spec.rb +465 -342
  42. data/spec/languages/french_spec.rb +3 -2
  43. data/spec/performance_spec.rb +7 -7
  44. data/spec/pragmatic_tokenizer_spec.rb +8 -8
  45. metadata +18 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 46da26b8afc38bfc2699a2875d9786be47702160
4
- data.tar.gz: 9073a0505244bb97d7fa51c48e9fe4ab30983c90
3
+ metadata.gz: 41ba3758c8ff32d83451b66e0b28ca8f33248843
4
+ data.tar.gz: 1f96d2fbf5a7a66a3031631b58a9b91b77991fc5
5
5
  SHA512:
6
- metadata.gz: 8397bca6ada5fae51d1894b26154b5f9fed73e375b9e78b2803da892c20e5f32efb2722f0fe767cbc171586a5a6979a720041de5f0fd5c50824a0498c55b8394
7
- data.tar.gz: f58cae264490ce16bef8e5512760e992dd38a11ced8fa872ab5faff2e92f61e6414f7e9ffd732f1e2910de560b2e9a2e98e1d59aa7039426ee0fbf3d8daaaa1f
6
+ metadata.gz: 7e62958fbf69b55d62c00e391a9c6ed8da4c55c0336e0160c172363198d3a1a711dbdc6310a94e54a9a65316b9c34913a886b92ee7eb6afe78530556a185663b
7
+ data.tar.gz: 9fbb6d481494ef235fd1d74b9c34833bb96887c29d530c28add6ee3e5109379b051f2a7520208f9385d686e0ec2ba8c655c849dfb43a6d7b850c3ec0092b526f
data/.rubocop.yml ADDED
@@ -0,0 +1,184 @@
1
+ #inherit_from: .rubocop_todo.yml
2
+
3
+ # check https://gist.github.com/jhass/a5ae80d87f18e53e7b56/84972ca6c0c5a59768ae07ee0a639d03cc72d375
4
+
5
+ Metrics/LineLength:
6
+ Max: 99
7
+ Enabled: false # as it complains about long comments too
8
+
9
+ Style/EmptyLinesAroundClassBody:
10
+ Enabled: false
11
+
12
+ Style/EmptyLinesAroundModuleBody:
13
+ Enabled: false
14
+
15
+ Style/AsciiComments:
16
+ Enabled: false
17
+
18
+ Style/ExtraSpacing:
19
+ AllowForAlignment: true
20
+
21
+ Style/IndentationConsistency:
22
+ EnforcedStyle: rails
23
+
24
+ Style/AndOr:
25
+ Enabled: false # complains about "redirect_to ... and return" in Rails controllers
26
+
27
+ # Fail is an alias of raise. Avoid aliases, it's more cognitive load for no gain.
28
+ # The argument that fail should be used to abort the program is wrong too,
29
+ # there's Kernel#abort for that.
30
+ Style/SignalException:
31
+ EnforcedStyle: only_raise
32
+
33
+ Style/StringLiterals:
34
+ Enabled: false
35
+
36
+ # allow Rubymine "Continuation indent" of 4 characters:
37
+ Style/CaseIndentation:
38
+ IndentationWidth: 4
39
+ Style/FirstParameterIndentation:
40
+ IndentationWidth: 4
41
+ Style/IndentArray:
42
+ IndentationWidth: 4
43
+ Style/IndentAssignment:
44
+ IndentationWidth: 4
45
+ Style/IndentHash:
46
+ IndentationWidth: 4
47
+
48
+ Style/MultilineMethodCallIndentation:
49
+ EnforcedStyle: indented
50
+ IndentationWidth: 4
51
+
52
+ # unsure how we'd want it
53
+ #Style/MultilineOperationIndentation:
54
+ # IndentationWidth: 4
55
+
56
+ # these are not continued (e.g. 'private'), so we keep them at 2 chars
57
+ Style/AccessModifierIndentation:
58
+ IndentationWidth: 2
59
+
60
+ Style/SpaceAroundOperators:
61
+ AllowForAlignment: true
62
+
63
+ # Most readable form.
64
+ Style/AlignHash:
65
+ EnforcedHashRocketStyle: table
66
+ EnforcedColonStyle: table
67
+
68
+ # Mixing the styles looks just silly.
69
+ Style/HashSyntax:
70
+ EnforcedStyle: ruby19_no_mixed_keys
71
+
72
+ # consider this to keep method parameters short
73
+ # No space makes the method definition shorter and differentiates
74
+ # from a regular assignment.
75
+ Style/SpaceAroundEqualsInParameterDefault:
76
+ EnforcedStyle: no_space
77
+
78
+ Rails:
79
+ Enabled: false
80
+
81
+ AllCops:
82
+ Include:
83
+ - '**/Rakefile'
84
+ Exclude:
85
+ - 'bin/**/*'
86
+ TargetRubyVersion:
87
+ 2.2
88
+
89
+ # These are all the cops that are disabled in the default configuration.
90
+
91
+ Style/AutoResourceCleanup:
92
+ Description: 'Suggests the usage of an auto resource cleanup version of a method (if available).'
93
+ Enabled: true
94
+
95
+ Style/CollectionMethods:
96
+ Description: 'Preferred collection methods.'
97
+ StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#map-find-select-reduce-size'
98
+ Enabled: true
99
+
100
+ #Style/Encoding:
101
+ # Description: 'Use UTF-8 as the source file encoding.'
102
+ # StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#utf-8'
103
+ # Enabled: true
104
+
105
+ #Style/InlineComment:
106
+ # Description: 'Avoid inline comments.'
107
+ # Enabled: true
108
+
109
+ Style/FirstArrayElementLineBreak:
110
+ Description: >-
111
+ Checks for a line break before the first element in a
112
+ multi-line array.
113
+ Enabled: true
114
+
115
+ Style/FirstHashElementLineBreak:
116
+ Description: >-
117
+ Checks for a line break before the first element in a
118
+ multi-line hash.
119
+ Enabled: true
120
+
121
+ Style/FirstMethodArgumentLineBreak:
122
+ Description: >-
123
+ Checks for a line break before the first argument in a
124
+ multi-line method call.
125
+ Enabled: true
126
+
127
+ Style/FirstMethodParameterLineBreak:
128
+ Description: >-
129
+ Checks for a line break before the first parameter in a
130
+ multi-line method parameter definition.
131
+ Enabled: true
132
+
133
+ Style/MethodCalledOnDoEndBlock:
134
+ Description: 'Avoid chaining a method call on a do...end block.'
135
+ StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#single-line-blocks'
136
+ Enabled: true
137
+
138
+ Style/MissingElse:
139
+ Description: >-
140
+ Require if/case expressions to have an else branch.
141
+ If enabled, it is recommended that
142
+ Style/UnlessElse and Style/EmptyElse be enabled.
143
+ This will conflict with Style/EmptyElse if
144
+ Style/EmptyElse is configured to style "both"
145
+ Enabled: false
146
+ EnforcedStyle: both
147
+ SupportedStyles:
148
+ # if - warn when an if expression is missing an else branch
149
+ # case - warn when a case expression is missing an else branch
150
+ # both - warn when an if or case expression is missing an else branch
151
+ - if
152
+ - case
153
+ - both
154
+
155
+ #Style/MultilineAssignmentLayout:
156
+ # Description: 'Check for a newline after the assignment operator in multi-line assignments.'
157
+ # StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#indent-conditional-assignment'
158
+ # Enabled: true
159
+
160
+ Style/MutableConstant:
161
+ Description: 'Do not assign mutable objects to constants.'
162
+ Enabled: true
163
+
164
+ Style/OptionHash:
165
+ Description: "Don't use option hashes when you can use keyword arguments."
166
+ Enabled: true
167
+
168
+ Style/Send:
169
+ Description: 'Prefer `Object#__send__` or `Object#public_send` to `send`, as `send` may overlap with existing methods.'
170
+ StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#prefer-public-send'
171
+ Enabled: true
172
+
173
+ Style/StringMethods:
174
+ Description: 'Checks if configured preferred methods are used over non-preferred.'
175
+ Enabled: true
176
+
177
+ #Style/SymbolArray:
178
+ # Description: 'Use %i or %I for arrays of symbols.'
179
+ # StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#percent-i'
180
+ # Enabled: true
181
+
182
+ Lint/LiteralInInterpolation:
183
+ Description: 'Avoid interpolating literals in strings'
184
+ AutoCorrect: true
data/.rubocop_todo.yml ADDED
@@ -0,0 +1,66 @@
1
+ # This configuration was generated by
2
+ # `rubocop --auto-gen-config`
3
+ # on 2016-01-23 03:18:41 +0100 using RuboCop version 0.36.0.
4
+ # The point is for the user to remove these configuration records
5
+ # one by one as the offenses are removed from the code base.
6
+ # Note that changes in the inspected code, or installation of new
7
+ # versions of RuboCop, may require this file to be generated again.
8
+
9
+ # Offense count: 11
10
+ Metrics/AbcSize:
11
+ Max: 137
12
+
13
+ # Offense count: 2
14
+ # Configuration parameters: CountComments.
15
+ Metrics/ClassLength:
16
+ Max: 214
17
+
18
+ # Offense count: 9
19
+ Metrics/CyclomaticComplexity:
20
+ Max: 41
21
+
22
+ # Offense count: 8
23
+ # Configuration parameters: CountComments.
24
+ Metrics/MethodLength:
25
+ Max: 57
26
+
27
+ # Offense count: 1
28
+ # Configuration parameters: CountComments.
29
+ Metrics/ModuleLength:
30
+ Max: 134
31
+
32
+ # Offense count: 6
33
+ Metrics/PerceivedComplexity:
34
+ Max: 43
35
+
36
+ # Offense count: 4
37
+ # Cop supports --auto-correct.
38
+ Style/CommentIndentation:
39
+ Exclude:
40
+ - 'lib/pragmatic_tokenizer/tokenizer.rb'
41
+
42
+ # Offense count: 31
43
+ Style/Documentation:
44
+ Enabled: false
45
+
46
+ # Offense count: 2
47
+ Style/MultilineBlockChain:
48
+ Exclude:
49
+ - 'lib/pragmatic_tokenizer/post_processor.rb'
50
+
51
+ # Offense count: 1
52
+ # Configuration parameters: SuspiciousParamNames.
53
+ # SuspiciousParamNames: options, opts, args, params, parameters
54
+ Style/OptionHash:
55
+ Exclude:
56
+ - 'lib/pragmatic_tokenizer/tokenizer.rb'
57
+
58
+ # Offense count: 4
59
+ # Cop supports --auto-correct.
60
+ # Configuration parameters: EnforcedStyle, SupportedStyles, AllowInnerSlashes.
61
+ # SupportedStyles: slashes, percent_r, mixed
62
+ Style/RegexpLiteral:
63
+ Exclude:
64
+ - 'lib/pragmatic_tokenizer/post_processor.rb'
65
+ - 'lib/pragmatic_tokenizer/pre_processor.rb'
66
+ - 'lib/pragmatic_tokenizer/tokenizer.rb'
data/README.md CHANGED
@@ -371,15 +371,8 @@ Contractions: No
371
371
  ## Resources
372
372
 
373
373
  * [The Art of Tokenization](https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en)
374
-
375
374
  * [Handbook Of Natural Language Processing Second Edition](https://karczmarczuk.users.greyc.fr/TEACH/TAL/Doc/Handbook%20Of%20Natural%20Language%20Processing,%20Second%20Edition%20Chapman%20&%20Hall%20Crc%20Machine%20Learning%20&%20Pattern%20Recognition%202010.pdf)
376
375
 
377
- ## Development
378
-
379
- After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
380
-
381
- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
382
-
383
376
  ## Contributing
384
377
 
385
378
  1. Fork it ( https://github.com/diasks2/pragmatic_tokenizer/fork )
data/Rakefile CHANGED
@@ -2,4 +2,4 @@ require 'bundler/gem_tasks'
2
2
  require 'rspec/core/rake_task'
3
3
 
4
4
  RSpec::Core::RakeTask.new(:spec)
5
- task :default => :spec
5
+ task default: :spec
@@ -15,7 +15,7 @@ module PragmaticTokenizer
15
15
  if split_punctuation[0].nil?
16
16
  cleaned_tokens << a
17
17
  else
18
- cleaned_tokens << a.tr(split_punctuation[0],'')
18
+ cleaned_tokens << a.tr(split_punctuation[0], '')
19
19
  if split_punctuation[0].length.eql?(1)
20
20
  cleaned_tokens << split_punctuation[0]
21
21
  else
@@ -28,4 +28,4 @@ module PragmaticTokenizer
28
28
  cleaned_tokens
29
29
  end
30
30
  end
31
- end
31
+ end
@@ -18,18 +18,18 @@ module PragmaticTokenizer
18
18
  cleaned_tokens = []
19
19
  tokens.each_with_index do |_t, i|
20
20
  if tokens[i + 1] && tokens[i] =~ /\A(.+)\.\z/
21
- w = $1
22
- unless abbr[Unicode::downcase(w)] || w =~ /\A[a-z]\z/i ||
23
- w =~ /[a-z](?:\.[a-z])+\z/i
24
- cleaned_tokens << w
21
+ w = Regexp.last_match(1)
22
+ unless abbr[Unicode.downcase(w)] || w =~ /\A[a-z]\z/i ||
23
+ w =~ /[a-z](?:\.[a-z])+\z/i
24
+ cleaned_tokens << w
25
25
  cleaned_tokens << '.'
26
26
  next
27
27
  end
28
28
  end
29
29
  cleaned_tokens << tokens[i]
30
30
  end
31
- if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbreviations.include?(Unicode::downcase(cleaned_tokens[-1]).chomp("."))
32
- cleaned_tokens[-1] = $1
31
+ if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbreviations.include?(Unicode.downcase(cleaned_tokens[-1]).chomp("."))
32
+ cleaned_tokens[-1] = Regexp.last_match(1)
33
33
  cleaned_tokens.push '.'
34
34
  end
35
35
  cleaned_tokens
@@ -7,4 +7,4 @@ module PragmaticTokenizer
7
7
  CONTRACTIONS = {}.freeze
8
8
  end
9
9
  end
10
- end
10
+ end
@@ -7,4 +7,4 @@ module PragmaticTokenizer
7
7
  CONTRACTIONS = {}.freeze
8
8
  end
9
9
  end
10
- end
10
+ end
@@ -7,4 +7,4 @@ module PragmaticTokenizer
7
7
  CONTRACTIONS = {}.freeze
8
8
  end
9
9
  end
10
- end
10
+ end
@@ -4,7 +4,7 @@ module PragmaticTokenizer
4
4
  PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^', '”'].freeze
5
5
  PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘", "‘" => "⚭" }.freeze
6
6
  SEMI_PUNCTUATION = ['。', '.', '.'].freeze
7
- ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix'].freeze
7
+ ROMAN_NUMERALS = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx xxi xxii xxiii xxiv xxv xxvi xxvii xxviii xxix xxx xxxi xxxii xxxiii xxxiv xxxv xxxvi xxxvii xxxviii xxxix xl xli xlii xliii xliv xlv xlvi xlvii xlviii xlix l li lii liii liv lv lvi lvii lviii lix lx lxi lxii lxiii lxiv lxv lxvi lxvii lxviii lxix lxx lxxi lxxii lxxiii lxxiv lxxv lxxvi lxxvii lxxviii lxxix lxxx lxxxi lxxxii lxxxiii lxxxiv lxxxv lxxxvi lxxxvii lxxxviii lxxxix xc xci xcii xciii xciv xcv xcvi xcvii xcviii xcix).freeze
8
8
  SPECIAL_CHARACTERS = ['®', '©', '™'].freeze
9
9
  ABBREVIATIONS = [].freeze
10
10
  STOP_WORDS = [].freeze
@@ -17,11 +17,11 @@ module PragmaticTokenizer
17
17
  class SingleQuotes
18
18
  def handle_single_quotes(text)
19
19
  # Convert left quotes to special character except for 'Twas or 'twas
20
- text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
21
- text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
20
+ text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { Regexp.last_match(1) ? Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
21
+ text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { Regexp.last_match(1) ? Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
22
22
  text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
23
23
  # Separate right single quotes
24
- text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
24
+ text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
25
25
  end
26
26
  end
27
27
  end
@@ -7,4 +7,4 @@ module PragmaticTokenizer
7
7
  CONTRACTIONS = {}.freeze
8
8
  end
9
9
  end
10
- end
10
+ end
@@ -3,8 +3,8 @@ module PragmaticTokenizer
3
3
  module Danish
4
4
  include Languages::Common
5
5
  ABBREVIATIONS = [].freeze
6
- STOP_WORDS = ["De", "I", "af", "aldrig", "alle", "altid", "bagved", "de", "der", "du", "efter", "eller", "en", "endnu", "et", "fjernt", "for", "foran", "fra", "", "gennem", "god", "han", "her", "hos", "hovfor", "hun", "hurtig", "hvad", "hvem", "hvonår", "hvor", "hvordan", "hvorhen", "imod", "ja", "jeg", "langsom", "lidt", "mange", "med", "meget", "mellem", "mere", "mindre", "måske", "nede", "nej", "nok", "nu", "når", "og", "oppe", "", "rask", "sammen", "temmelig", "til", "uden", "udenfor", "under", "ved", "vi"].freeze
6
+ STOP_WORDS = %w(De I af aldrig alle altid bagved de der du efter eller en endnu et fjernt for foran fra få gennem god han her hos hovfor hun hurtig hvad hvem hvonår hvor hvordan hvorhen imod ja jeg langsom lidt mange med meget mellem mere mindre måske nede nej nok nu når og oppe på rask sammen temmelig til uden udenfor under ved vi).freeze
7
7
  CONTRACTIONS = {}.freeze
8
8
  end
9
9
  end
10
- end
10
+ end
@@ -2,27 +2,98 @@ module PragmaticTokenizer
2
2
  module Languages
3
3
  module Deutsch
4
4
  include Languages::Common
5
- ABBREVIATIONS = ['a', 'a.d', 'a.k.a', 'a.s.a.p', 'abg', 'alt', 'apr', 'art', 'aug', 'b',
6
- 'b.a', 'b.s', 'best', 'bgm', 'bldg', 'btw', 'buchst', 'bzgl', 'bzw', 'c',
7
- 'ca', 'co', 'd', 'd.d', 'd.h', 'd.r', 'dergl', 'dez', 'dgl', 'dr', 'dr ',
8
- 'dt', 'dzt', 'e', 'e.l', 'e.u', 'e.v', 'ehem', 'eig', 'etc', 'etc.p.p',
9
- 'eu', 'europ', 'ev', 'ev ', 'evtl', 'f', 'f.d', 'feat', 'feb', 'ff',
10
- 'fr', 'frz', 'ft', 'g', 'gg', 'ggf', 'ggü', 'griech', 'h', 'h.c', 'h.p',
11
- 'hon', 'hosp', 'hr', 'i', 'i.a', 'i.d', 'i.d.r', 'i.f', 'i.p', 'i.z',
12
- 'ii', 'iii', 'inkl', 'int', 'iv', 'ix', 'j', 'jan', 'jul', 'jun', 'k',
13
- 'k.a', 'k.i.z', 'k.o', 'k.u.k', 'kath ', 'l', 'l.a', 'lfd', 'lt', 'ltd',
14
- 'm', 'm.e', 'm.w', 'mag', 'max', 'me', 'med', 'mind', 'mio', 'mme', 'mr',
15
- 'mrd', 'mrs', 'ms', 'mwst', 'mär', 'n', 'nov', 'nr', 'o', 'o.k', 'o.ä',
16
- 'oct', 'okt', 'omg', '', 'p', 'p.a', 'p.m', 'p.s', 'p.t', 'pol', 'pp',
17
- 'prof', 'präs', 'q', 'r', 'r.i.p', 'r.r', 'ranz', 'rd', 'rep', 'rt',
18
- 'russ', 's', 's.g', 'sen', 'sep', 'sog', 'st', 'std', 'str', 't', 'türk',
19
- 'u', 'u.a', 'u.a ', 'u.a.m', 'u.a.v', 'u.k', 'u.s', 'u.s.w', 'u.u',
20
- 'u.v.a', 'u.v.m', 'u', 'ungar', 'usf', 'usw', 'uvm', 'v', 'v.a', 'v.d',
21
- 'v.m', 'vgl', 'vi', 'vii', 'viii', 'vs', 'w', 'wg', 'wr', 'x', 'xi',
22
- 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'y',
23
- 'z', 'z.b', 'z.t', 'z.z', 'z.zt', 'zb', 'zt', 'zw', 'zzt', 'ä', 'ö',
24
- 'öffentl', 'öst', 'österr', 'ü'].freeze
25
- STOP_WORDS = ["a", "ab", "aber", "ach", "acht", "achte", "achten", "achter", "achtes", "ag", "alle", "allein", "allem", "allen", "aller", "allerdings", "alles", "allgemeinen", "als", "also", "am", "an", "andere", "anderen", "andern", "anders", "au", "auch", "auf", "aus", "ausser", "ausserdem", "außer", "außerdem", "b", "bald", "bei", "beide", "beiden", "beim", "beispiel", "bekannt", "bereits", "besonders", "besser", "besten", "bin", "bis", "bisher", "bist", "c", "d", "d.h", "da", "dabei", "dadurch", "dafür", "dagegen", "daher", "dahin", "dahinter", "damals", "damit", "danach", "daneben", "dank", "dann", "daran", "darauf", "daraus", "darf", "darfst", "darin", "darum", "darunter", "darüber", "das", "dasein", "daselbst", "dass", "dasselbe", "davon", "davor", "dazu", "dazwischen", "daß", "dein", "deine", "deinem", "deiner", "dem", "dementsprechend", "demgegenüber", "demgemäss", "demgemäß", "demselben", "demzufolge", "den", "denen", "denn", "denselben", "der", "deren", "derjenige", "derjenigen", "dermassen", "dermaßen", "derselbe", "derselben", "des", "deshalb", "desselben", "dessen", "deswegen", "dich", "die", "diejenige", "diejenigen", "dies", "diese", "dieselbe", "dieselben", "diesem", "diesen", "dieser", "dieses", "dir", "doch", "dort", "drei", "drin", "dritte", "dritten", "dritter", "drittes", "du", "durch", "durchaus", "durfte", "durften", "dürfen", "dürft", "e", "eben", "ebenso", "ehrlich", "ei", "ei,", "eigen", "eigene", "eigenen", "eigener", "eigenes", "ein", "einander", "eine", "einem", "einen", "einer", "eines", "einige", "einigen", "einiger", "einiges", "einmal", "eins", "elf", "en", "ende", "endlich", "entweder", "er", "erst", "erste", "ersten", "erster", "erstes", "es", "etwa", "etwas", "euch", "euer", "eure", "f", "früher", "fünf", "fünfte", "fünften", "fünfter", "fünftes", "für", "g", "gab", "ganz", "ganze", "ganzen", "ganzer", "ganzes", "gar", "gedurft", "gegen", "gegenüber", "gehabt", "gehen", "geht", "gekannt", "gekonnt", "gemacht", "gemocht", 
"gemusst", "genug", "gerade", "gern", "gesagt", "geschweige", "gewesen", "gewollt", "geworden", "gibt", "ging", "gleich", "gott", "gross", "grosse", "grossen", "grosser", "grosses", "groß", "große", "großen", "großer", "großes", "gut", "gute", "guter", "gutes", "h", "habe", "haben", "habt", "hast", "hat", "hatte", "hatten", "hattest", "hattet", "heisst", "her", "heute", "hier", "hin", "hinter", "hoch", "hätte", "hätten", "i", "ich", "ihm", "ihn", "ihnen", "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "im", "immer", "in", "indem", "infolgedessen", "ins", "irgend", "ist", "j", "ja", "jahr", "jahre", "jahren", "je", "jede", "jedem", "jeden", "jeder", "jedermann", "jedermanns", "jedes", "jedoch", "jemand", "jemandem", "jemanden", "jene", "jenem", "jenen", "jener", "jenes", "jetzt", "k", "kam", "kann", "kannst", "kaum", "kein", "keine", "keinem", "keinen", "keiner", "kleine", "kleinen", "kleiner", "kleines", "km", "kommen", "kommt", "konnte", "konnten", "kurz", "können", "könnt", "könnte", "l", "lang", "lange", "leicht", "leide", "lieber", "los", "m", "machen", "macht", "machte", "mag", "magst", "mahn", "man", "manche", "manchem", "manchen", "mancher", "manches", "mann", "mehr", "mein", "meine", "meinem", "meinen", "meiner", "meines", "mensch", "menschen", "mich", "mir", "mit", "mittel", "mochte", "mochten", "morgen", "muss", "musst", "musste", "mussten", "muß", "mußt", "möchte", "mögen", "möglich", "mögt", "müssen", "müsst", "müßt", "n", "na", "nach", "nachdem", "nahm", "natürlich", "neben", "nein", "neue", "neuen", "neun", "neunte", "neunten", "neunter", "neuntes", "nicht", "nichts", "nie", "niemand", "niemandem", "niemanden", "noch", "nun", "nur", "o", "ob", "oben", "oder", "offen", "oft", "ohne", "p", "q", "r", "recht", "rechte", "rechten", "rechter", "rechtes", "richtig", "rund", "s", "sa", "sache", "sagt", "sagte", "sah", "satt", "schlecht", "schon", "sechs", "sechste", "sechsten", "sechster", "sechstes", "sehr", "sei", "seid", "seien", "sein", "seine", 
"seinem", "seinen", "seiner", "seines", "seit", "seitdem", "selbst", "sich", "sie", "sieben", "siebente", "siebenten", "siebenter", "siebentes", "sind", "so", "solang", "solche", "solchem", "solchen", "solcher", "solches", "soll", "sollen", "sollst", "sollt", "sollte", "sollten", "sondern", "sonst", "soweit", "sowie", "später", "statt", "t", "tag", "tage", "tagen", "tat", "teil", "tel", "tritt", "trotzdem", "tun", "u", "uhr", "um", "und", "und?", "uns", "unser", "unsere", "unserer", "unter", "v", "vergangenen", "viel", "viele", "vielem", "vielen", "vielleicht", "vier", "vierte", "vierten", "vierter", "viertes", "vom", "von", "vor", "w", "wahr?", "wann", "war", "waren", "wart", "warum", "was", "wegen", "weil", "weit", "weiter", "weitere", "weiteren", "weiteres", "welche", "welchem", "welchen", "welcher", "welches", "wem", "wen", "wenig", "wenige", "weniger", "weniges", "wenigstens", "wenn", "wer", "werde", "werden", "werdet", "weshalb", "wessen", "wie", "wieder", "wieso", "will", "willst", "wir", "wird", "wirklich", "wirst", "wo", "woher", "wohin", "wohl", "wollen", "wollt", "wollte", "wollten", "worden", "wurde", "wurden", "während", "währenddem", "währenddessen", "wäre", "würde", "würden", "x", "y", "z", "z.b", "zehn", "zehnte", "zehnten", "zehnter", "zehntes", "zeit", "zu", "zuerst", "zugleich", "zum", "zunächst", "zur", "zurück", "zusammen", "zwanzig", "zwar", "zwei", "zweite", "zweiten", "zweiter", "zweites", "zwischen", "zwölf", "über", "überhaupt", "übrigens"].freeze
5
+ ABBREVIATIONS = [
6
+ 'a', 'a.d', 'a.k.a', 'a.s.a.p', 'abg', 'alt', 'apr', 'art', 'aug', 'b',
7
+ 'b.a', 'b.s', 'best', 'bgm', 'bldg', 'btw', 'buchst', 'bzgl', 'bzw', 'c',
8
+ 'ca', 'co', 'd', 'd.d', 'd.h', 'd.r', 'dergl', 'dez', 'dgl', 'dr', 'dr ',
9
+ 'dt', 'dzt', 'e', 'e.l', 'e.u', 'e.v', 'ehem', 'eig', 'etc', 'etc.p.p',
10
+ 'eu', 'europ', 'ev', 'ev ', 'evtl', 'f', 'f.d', 'feat', 'feb', 'ff',
11
+ 'fr', 'frz', 'ft', 'g', 'gg', 'ggf', 'ggü', 'griech', 'h', 'h.c', 'h.p',
12
+ 'hon', 'hosp', 'hr', 'i', 'i.a', 'i.d', 'i.d.r', 'i.f', 'i.p', 'i.z',
13
+ 'ii', 'iii', 'inkl', 'int', 'iv', 'ix', 'j', 'jan', 'jul', 'jun', 'k',
14
+ 'k.a', 'k.i.z', 'k.o', 'k.u.k', 'kath ', 'l', 'l.a', 'lfd', 'lt', 'ltd',
15
+ 'm', 'm.e', 'm.w', 'mag', 'max', 'me', 'med', 'mind', 'mio', 'mme', 'mr',
16
+ 'mrd', 'mrs', 'ms', 'mwst', 'mär', 'n', 'nov', 'nr', 'o', 'o.k', 'o.ä',
17
+ 'oct', 'okt', 'omg', '', 'p', 'p.a', 'p.m', 'p.s', 'p.t', 'pol', 'pp',
18
+ 'prof', 'präs', 'q', 'r', 'r.i.p', 'r.r', 'ranz', 'rd', 'rep', 'rt',
19
+ 'russ', 's', 's.g', 'sen', 'sep', 'sog', 'st', 'std', 'str', 't', 'türk',
20
+ 'u', 'u.a', 'u.a ', 'u.a.m', 'u.a.v', 'u.k', 'u.s', 'u.s.w', 'u.u',
21
+ 'u.v.a', 'u.v.m', 'u.ä', 'ungar', 'usf', 'usw', 'uvm', 'v', 'v.a', 'v.d',
22
+ 'v.m', 'vgl', 'vi', 'vii', 'viii', 'vs', 'w', 'wg', 'wr', 'x', 'xi',
23
+ 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'y',
24
+ 'z', 'z.b', 'z.t', 'z.z', 'z.zt', 'zb', 'zt', 'zw', 'zzt', 'ä', 'ö',
25
+ 'öffentl', 'öst', 'österr', 'ü'].freeze
26
+ STOP_WORDS = [
27
+ "a", "ab", "aber", "ach", "acht", "achte", "achten", "achter", "achtes",
28
+ "ag", "alle", "allein", "allem", "allen", "aller", "allerdings", "alles",
29
+ "allgemeinen", "als", "also", "am", "an", "andere", "anderen", "andern",
30
+ "anders", "au", "auch", "auf", "aus", "ausser", "ausserdem", "außer",
31
+ "außerdem", "b", "bald", "bei", "beide", "beiden", "beim", "beispiel",
32
+ "bekannt", "bereits", "besonders", "besser", "besten", "bin", "bis",
33
+ "bisher", "bist", "c", "d", "d.h", "da", "dabei", "dadurch", "dafür",
34
+ "dagegen", "daher", "dahin", "dahinter", "damals", "damit", "danach",
35
+ "daneben", "dank", "dann", "daran", "darauf", "daraus", "darf", "darfst",
36
+ "darin", "darum", "darunter", "darüber", "das", "dasein", "daselbst", "dass",
37
+ "dasselbe", "davon", "davor", "dazu", "dazwischen", "daß", "dein", "deine",
38
+ "deinem", "deiner", "dem", "dementsprechend", "demgegenüber", "demgemäss",
39
+ "demgemäß", "demselben", "demzufolge", "den", "denen", "denn", "denselben",
40
+ "der", "deren", "derjenige", "derjenigen", "dermassen", "dermaßen", "derselbe",
41
+ "derselben", "des", "deshalb", "desselben", "dessen", "deswegen", "dich", "die",
42
+ "diejenige", "diejenigen", "dies", "diese", "dieselbe", "dieselben", "diesem",
43
+ "diesen", "dieser", "dieses", "dir", "doch", "dort", "drei", "drin", "dritte",
44
+ "dritten", "dritter", "drittes", "du", "durch", "durchaus", "durfte", "durften",
45
+ "dürfen", "dürft", "e", "eben", "ebenso", "ehrlich", "ei", "ei,", "eigen",
46
+ "eigene", "eigenen", "eigener", "eigenes", "ein", "einander", "eine", "einem",
47
+ "einen", "einer", "eines", "einige", "einigen", "einiger", "einiges", "einmal",
48
+ "eins", "elf", "en", "ende", "endlich", "entweder", "er", "erst", "erste",
49
+ "ersten", "erster", "erstes", "es", "etwa", "etwas", "euch", "euer", "eure", "f",
50
+ "früher", "fünf", "fünfte", "fünften", "fünfter", "fünftes", "für", "g", "gab",
51
+ "ganz", "ganze", "ganzen", "ganzer", "ganzes", "gar", "gedurft", "gegen",
52
+ "gegenüber", "gehabt", "gehen", "geht", "gekannt", "gekonnt", "gemacht",
53
+ "gemocht", "gemusst", "genug", "gerade", "gern", "gesagt", "geschweige",
54
+ "gewesen", "gewollt", "geworden", "gibt", "ging", "gleich", "gott", "gross",
55
+ "grosse", "grossen", "grosser", "grosses", "groß", "große", "großen", "großer",
56
+ "großes", "gut", "gute", "guter", "gutes", "h", "habe", "haben", "habt", "hast",
57
+ "hat", "hatte", "hatten", "hattest", "hattet", "heisst", "her", "heute", "hier",
58
+ "hin", "hinter", "hoch", "hätte", "hätten", "i", "ich", "ihm", "ihn", "ihnen",
59
+ "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "im", "immer", "in", "indem",
60
+ "infolgedessen", "ins", "irgend", "ist", "j", "ja", "jahr", "jahre", "jahren",
61
+ "je", "jede", "jedem", "jeden", "jeder", "jedermann", "jedermanns", "jedes",
62
+ "jedoch", "jemand", "jemandem", "jemanden", "jene", "jenem", "jenen", "jener",
63
+ "jenes", "jetzt", "k", "kam", "kann", "kannst", "kaum", "kein", "keine", "keinem",
64
+ "keinen", "keiner", "kleine", "kleinen", "kleiner", "kleines", "km", "kommen",
65
+ "kommt", "konnte", "konnten", "kurz", "können", "könnt", "könnte", "l", "lang",
66
+ "lange", "leicht", "leide", "lieber", "los", "m", "machen", "macht", "machte",
67
+ "mag", "magst", "mahn", "man", "manche", "manchem", "manchen", "mancher", "manches",
68
+ "mann", "mehr", "mein", "meine", "meinem", "meinen", "meiner", "meines", "mensch",
69
+ "menschen", "mich", "mir", "mit", "mittel", "mochte", "mochten", "morgen", "muss",
70
+ "musst", "musste", "mussten", "muß", "mußt", "möchte", "mögen", "möglich", "mögt",
71
+ "müssen", "müsst", "müßt", "n", "na", "nach", "nachdem", "nahm", "natürlich",
72
+ "neben", "nein", "neue", "neuen", "neun", "neunte", "neunten", "neunter", "neuntes",
73
+ "nicht", "nichts", "nie", "niemand", "niemandem", "niemanden", "noch", "nun", "nur",
74
+ "o", "ob", "oben", "oder", "offen", "oft", "ohne", "p", "q", "r", "recht", "rechte",
75
+ "rechten", "rechter", "rechtes", "richtig", "rund", "s", "sa", "sache", "sagt",
76
+ "sagte", "sah", "satt", "schlecht", "schon", "sechs", "sechste", "sechsten",
77
+ "sechster", "sechstes", "sehr", "sei", "seid", "seien", "sein", "seine", "seinem",
78
+ "seinen", "seiner", "seines", "seit", "seitdem", "selbst", "sich", "sie", "sieben",
79
+ "siebente", "siebenten", "siebenter", "siebentes", "sind", "so", "solang", "solche",
80
+ "solchem", "solchen", "solcher", "solches", "soll", "sollen", "sollst", "sollt",
81
+ "sollte", "sollten", "sondern", "sonst", "soweit", "sowie", "später", "statt", "t",
82
+ "tag", "tage", "tagen", "tat", "teil", "tel", "tritt", "trotzdem", "tun", "u",
83
+ "uhr", "um", "und", "und?", "uns", "unser", "unsere", "unserer", "unter", "v",
84
+ "vergangenen", "viel", "viele", "vielem", "vielen", "vielleicht", "vier", "vierte",
85
+ "vierten", "vierter", "viertes", "vom", "von", "vor", "w", "wahr?", "wann", "war",
86
+ "waren", "wart", "warum", "was", "wegen", "weil", "weit", "weiter", "weitere",
87
+ "weiteren", "weiteres", "welche", "welchem", "welchen", "welcher", "welches", "wem",
88
+ "wen", "wenig", "wenige", "weniger", "weniges", "wenigstens", "wenn", "wer", "werde",
89
+ "werden", "werdet", "weshalb", "wessen", "wie", "wieder", "wieso", "will", "willst",
90
+ "wir", "wird", "wirklich", "wirst", "wo", "woher", "wohin", "wohl", "wollen", "wollt",
91
+ "wollte", "wollten", "worden", "wurde", "wurden", "während", "währenddem",
92
+ "währenddessen", "wäre", "würde", "würden", "x", "y", "z", "z.b", "zehn", "zehnte",
93
+ "zehnten", "zehnter", "zehntes", "zeit", "zu", "zuerst", "zugleich", "zum",
94
+ "zunächst", "zur", "zurück", "zusammen", "zwanzig", "zwar", "zwei", "zweite",
95
+ "zweiten", "zweiter", "zweites", "zwischen", "zwölf", "über", "überhaupt",
96
+ "übrigens"].freeze
26
97
  CONTRACTIONS = {
27
98
  "auf's" => "auf das",
28
99
  "can't" => "cannot",
@@ -63,7 +134,7 @@ module PragmaticTokenizer
63
134
  "wird's" => "wird es",
64
135
  "wär's" => "wäre es",
65
136
  "ö's" => "österreichs"
66
- }
137
+ }.freeze
67
138
  end
68
139
  end
69
- end
140
+ end
@@ -3,8 +3,8 @@ module PragmaticTokenizer
3
3
  module Dutch
4
4
  include Languages::Common
5
5
  ABBREVIATIONS = [].freeze
6
- STOP_WORDS = ["aan", "af", "al", "als", "bij", "dan", "dat", "die", "dit", "een", "en", "er", "had", "heb", "hem", "het", "hij", "hoe", "hun", "ik", "in", "is", "je", "kan", "me", "men", "met", "mij", "nog", "nu", "of", "ons", "ook", "te", "tot", "uit", "van", "was", "wat", "we", "wel", "wij", "zal", "ze", "zei", "zij", "zo", "zou", "aan", "aangaande", "aangezien", "achter", "achterna", "afgelopen", "aldaar", "aldus", "alhoewel", "alias", "alle", "allebei", "alleen", "alsnog", "altijd", "altoos", "ander", "andere", "anders", "anderszins", "behalve", "behoudens", "beide", "beiden", "ben", "beneden", "bent", "bepaald", "betreffende", "binnen", "binnenin", "boven", "bovenal", "bovendien", "bovengenoemd", "bovenstaand", "bovenvermeld", "buiten", "daar", "daarheen", "daarin", "daarna", "daarnet", "daarom", "daarop", "daarvanlangs", "de", "dikwijls", "door", "doorgaand", "dus", "echter", "eer", "eerdat", "eerder", "eerlang", "eerst", "elk", "elke", "enig", "enigszins", "enkel", "erdoor", "even", "eveneens", "evenwel", "gauw", "gedurende", "geen", "gehad", "gekund", "geleden", "gelijk", "gemoeten", "gemogen", "geweest", "gewoon", "gewoonweg", "haar", "hadden", "hare", "hebben", "hebt", "heeft", "hen", "hierbeneden", "hierboven", "hoewel", "hunne", "ikzelf", "inmiddels", "inzake", "jezelf", "jij", "jijzelf", "jou", "jouw", "jouwe", "juist", "jullie", "klaar", "kon", "konden", "krachtens", "kunnen", "kunt", "later", "liever", "maar", "mag", "meer", "mezelf", "mijn", "mijnent", "mijner", "mijzelf", "misschien", "mocht", "mochten", "moest", "moesten", "moet", "moeten", "mogen", "na", "naar", "nadat", "net", "niet", "noch", "nogal", "ofschoon", "om", "omdat", "omhoog", "omlaag", "omstreeks", "omtrent", "omver", "onder", "ondertussen", "ongeveer", "onszelf", "onze", "op", "opnieuw", "opzij", "over", "overeind", "overigens", "pas", "precies", "reeds", "rond", "rondom", "sedert", "sinds", "sindsdien", "slechts", "sommige", "spoedig", "steeds", "tamelijk", "tenzij", "terwijl", 
"thans", "tijdens", "toch", "toen", "toenmaals", "toenmalig", "totdat", "tussen", "uitgezonderd", "vaakwat", "vandaan", "vanuit", "vanwege", "veeleer", "verder", "vervolgens", "vol", "volgens", "voor", "vooraf", "vooral", "vooralsnog", "voorbij", "voordat", "voordezen", "voordien", "voorheen", "voorop", "vooruit", "vrij", "vroeg", "waar", "waarom", "wanneer", "want", "waren", "weer", "weg", "wegens", "weldra", "welk", "welke", "wie", "wiens", "wier", "wijzelf", "zelfs", "zichzelf", "zijn", "zijne", "zodra", "zonder", "zouden", "zowat", "zulke", "zullen", "zult", "worden", "wordt", "deze"].freeze
6
+ STOP_WORDS = %w(aan af al als bij dan dat die dit een en er had heb hem het hij hoe hun ik in is je kan me men met mij nog nu of ons ook te tot uit van was wat we wel wij zal ze zei zij zo zou aan aangaande aangezien achter achterna afgelopen aldaar aldus alhoewel alias alle allebei alleen alsnog altijd altoos ander andere anders anderszins behalve behoudens beide beiden ben beneden bent bepaald betreffende binnen binnenin boven bovenal bovendien bovengenoemd bovenstaand bovenvermeld buiten daar daarheen daarin daarna daarnet daarom daarop daarvanlangs de dikwijls door doorgaand dus echter eer eerdat eerder eerlang eerst elk elke enig enigszins enkel erdoor even eveneens evenwel gauw gedurende geen gehad gekund geleden gelijk gemoeten gemogen geweest gewoon gewoonweg haar hadden hare hebben hebt heeft hen hierbeneden hierboven hoewel hunne ikzelf inmiddels inzake jezelf jij jijzelf jou jouw jouwe juist jullie klaar kon konden krachtens kunnen kunt later liever maar mag meer mezelf mijn mijnent mijner mijzelf misschien mocht mochten moest moesten moet moeten mogen na naar nadat net niet noch nogal ofschoon om omdat omhoog omlaag omstreeks omtrent omver onder ondertussen ongeveer onszelf onze op opnieuw opzij over overeind overigens pas precies reeds rond rondom sedert sinds sindsdien slechts sommige spoedig steeds tamelijk tenzij terwijl thans tijdens toch toen toenmaals toenmalig totdat tussen uitgezonderd vaakwat vandaan vanuit vanwege veeleer verder vervolgens vol volgens voor vooraf vooral vooralsnog voorbij voordat voordezen voordien voorheen voorop vooruit vrij vroeg waar waarom wanneer want waren weer weg wegens weldra welk welke wie wiens wier wijzelf zelfs zichzelf zijn zijne zodra zonder zouden zowat zulke zullen zult worden wordt deze).freeze
7
7
  CONTRACTIONS = {}.freeze
8
8
  end
9
9
  end
10
- end
10
+ end