pragmatic_tokenizer 1.4.0 → 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +184 -0
- data/.rubocop_todo.yml +66 -0
- data/README.md +0 -7
- data/Rakefile +1 -1
- data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
- data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
- data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages.rb +28 -28
- data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
- data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
- data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +1 -0
- data/spec/languages/bulgarian_spec.rb +17 -13
- data/spec/languages/deutsch_spec.rb +110 -86
- data/spec/languages/english_spec.rb +465 -342
- data/spec/languages/french_spec.rb +3 -2
- data/spec/performance_spec.rb +7 -7
- data/spec/pragmatic_tokenizer_spec.rb +8 -8
- metadata +18 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 41ba3758c8ff32d83451b66e0b28ca8f33248843
|
4
|
+
data.tar.gz: 1f96d2fbf5a7a66a3031631b58a9b91b77991fc5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7e62958fbf69b55d62c00e391a9c6ed8da4c55c0336e0160c172363198d3a1a711dbdc6310a94e54a9a65316b9c34913a886b92ee7eb6afe78530556a185663b
|
7
|
+
data.tar.gz: 9fbb6d481494ef235fd1d74b9c34833bb96887c29d530c28add6ee3e5109379b051f2a7520208f9385d686e0ec2ba8c655c849dfb43a6d7b850c3ec0092b526f
|
data/.rubocop.yml
ADDED
@@ -0,0 +1,184 @@
|
|
1
|
+
#inherit_from: .rubocop_todo.yml
|
2
|
+
|
3
|
+
# check https://gist.github.com/jhass/a5ae80d87f18e53e7b56/84972ca6c0c5a59768ae07ee0a639d03cc72d375
|
4
|
+
|
5
|
+
Metrics/LineLength:
|
6
|
+
Max: 99
|
7
|
+
Enabled: false # as it complains about long comments too
|
8
|
+
|
9
|
+
Style/EmptyLinesAroundClassBody:
|
10
|
+
Enabled: false
|
11
|
+
|
12
|
+
Style/EmptyLinesAroundModuleBody:
|
13
|
+
Enabled: false
|
14
|
+
|
15
|
+
Style/AsciiComments:
|
16
|
+
Enabled: false
|
17
|
+
|
18
|
+
Style/ExtraSpacing:
|
19
|
+
AllowForAlignment: true
|
20
|
+
|
21
|
+
Style/IndentationConsistency:
|
22
|
+
EnforcedStyle: rails
|
23
|
+
|
24
|
+
Style/AndOr:
|
25
|
+
Enabled: false # complains about "redirect_to ... and return" in Rails controllers
|
26
|
+
|
27
|
+
# Fail is an alias of raise. Avoid aliases, it's more cognitive load for no gain.
|
28
|
+
# The argument that fail should be used to abort the program is wrong too,
|
29
|
+
# there's Kernel#abort for that.
|
30
|
+
Style/SignalException:
|
31
|
+
EnforcedStyle: only_raise
|
32
|
+
|
33
|
+
Style/StringLiterals:
|
34
|
+
Enabled: false
|
35
|
+
|
36
|
+
# allow Rubymine "Continuation indent" of 4 characters:
|
37
|
+
Style/CaseIndentation:
|
38
|
+
IndentationWidth: 4
|
39
|
+
Style/FirstParameterIndentation:
|
40
|
+
IndentationWidth: 4
|
41
|
+
Style/IndentArray:
|
42
|
+
IndentationWidth: 4
|
43
|
+
Style/IndentAssignment:
|
44
|
+
IndentationWidth: 4
|
45
|
+
Style/IndentHash:
|
46
|
+
IndentationWidth: 4
|
47
|
+
|
48
|
+
Style/MultilineMethodCallIndentation:
|
49
|
+
EnforcedStyle: indented
|
50
|
+
IndentationWidth: 4
|
51
|
+
|
52
|
+
# unsure how we'd want it
|
53
|
+
#Style/MultilineOperationIndentation:
|
54
|
+
# IndentationWidth: 4
|
55
|
+
|
56
|
+
# these are not continued (e.g. 'private'), so we keep them at 2 chars
|
57
|
+
Style/AccessModifierIndentation:
|
58
|
+
IndentationWidth: 2
|
59
|
+
|
60
|
+
Style/SpaceAroundOperators:
|
61
|
+
AllowForAlignment: true
|
62
|
+
|
63
|
+
# Most readable form.
|
64
|
+
Style/AlignHash:
|
65
|
+
EnforcedHashRocketStyle: table
|
66
|
+
EnforcedColonStyle: table
|
67
|
+
|
68
|
+
# Mixing the styles looks just silly.
|
69
|
+
Style/HashSyntax:
|
70
|
+
EnforcedStyle: ruby19_no_mixed_keys
|
71
|
+
|
72
|
+
# consider this to keep method parameters short
|
73
|
+
# No space makes the method definition shorter and differentiates
|
74
|
+
# from a regular assignment.
|
75
|
+
Style/SpaceAroundEqualsInParameterDefault:
|
76
|
+
EnforcedStyle: no_space
|
77
|
+
|
78
|
+
Rails:
|
79
|
+
Enabled: false
|
80
|
+
|
81
|
+
AllCops:
|
82
|
+
Include:
|
83
|
+
- '**/Rakefile'
|
84
|
+
Exclude:
|
85
|
+
- 'bin/**/*'
|
86
|
+
TargetRubyVersion:
|
87
|
+
2.2
|
88
|
+
|
89
|
+
# These are all the cops that are disabled in the default configuration.
|
90
|
+
|
91
|
+
Style/AutoResourceCleanup:
|
92
|
+
Description: 'Suggests the usage of an auto resource cleanup version of a method (if available).'
|
93
|
+
Enabled: true
|
94
|
+
|
95
|
+
Style/CollectionMethods:
|
96
|
+
Description: 'Preferred collection methods.'
|
97
|
+
StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#map-find-select-reduce-size'
|
98
|
+
Enabled: true
|
99
|
+
|
100
|
+
#Style/Encoding:
|
101
|
+
# Description: 'Use UTF-8 as the source file encoding.'
|
102
|
+
# StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#utf-8'
|
103
|
+
# Enabled: true
|
104
|
+
|
105
|
+
#Style/InlineComment:
|
106
|
+
# Description: 'Avoid inline comments.'
|
107
|
+
# Enabled: true
|
108
|
+
|
109
|
+
Style/FirstArrayElementLineBreak:
|
110
|
+
Description: >-
|
111
|
+
Checks for a line break before the first element in a
|
112
|
+
multi-line array.
|
113
|
+
Enabled: true
|
114
|
+
|
115
|
+
Style/FirstHashElementLineBreak:
|
116
|
+
Description: >-
|
117
|
+
Checks for a line break before the first element in a
|
118
|
+
multi-line hash.
|
119
|
+
Enabled: true
|
120
|
+
|
121
|
+
Style/FirstMethodArgumentLineBreak:
|
122
|
+
Description: >-
|
123
|
+
Checks for a line break before the first argument in a
|
124
|
+
multi-line method call.
|
125
|
+
Enabled: true
|
126
|
+
|
127
|
+
Style/FirstMethodParameterLineBreak:
|
128
|
+
Description: >-
|
129
|
+
Checks for a line break before the first parameter in a
|
130
|
+
multi-line method parameter definition.
|
131
|
+
Enabled: true
|
132
|
+
|
133
|
+
Style/MethodCalledOnDoEndBlock:
|
134
|
+
Description: 'Avoid chaining a method call on a do...end block.'
|
135
|
+
StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#single-line-blocks'
|
136
|
+
Enabled: true
|
137
|
+
|
138
|
+
Style/MissingElse:
|
139
|
+
Description: >-
|
140
|
+
Require if/case expressions to have an else branches.
|
141
|
+
If enabled, it is recommended that
|
142
|
+
Style/UnlessElse and Style/EmptyElse be enabled.
|
143
|
+
This will conflict with Style/EmptyElse if
|
144
|
+
Style/EmptyElse is configured to style "both"
|
145
|
+
Enabled: false
|
146
|
+
EnforcedStyle: both
|
147
|
+
SupportedStyles:
|
148
|
+
# if - warn when an if expression is missing an else branch
|
149
|
+
# case - warn when a case expression is missing an else branch
|
150
|
+
# both - warn when an if or case expression is missing an else branch
|
151
|
+
- if
|
152
|
+
- case
|
153
|
+
- both
|
154
|
+
|
155
|
+
#Style/MultilineAssignmentLayout:
|
156
|
+
# Description: 'Check for a newline after the assignment operator in multi-line assignments.'
|
157
|
+
# StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#indent-conditional-assignment'
|
158
|
+
# Enabled: true
|
159
|
+
|
160
|
+
Style/MutableConstant:
|
161
|
+
Description: 'Do not assign mutable objects to constants.'
|
162
|
+
Enabled: true
|
163
|
+
|
164
|
+
Style/OptionHash:
|
165
|
+
Description: "Don't use option hashes when you can use keyword arguments."
|
166
|
+
Enabled: true
|
167
|
+
|
168
|
+
Style/Send:
|
169
|
+
Description: 'Prefer `Object#__send__` or `Object#public_send` to `send`, as `send` may overlap with existing methods.'
|
170
|
+
StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#prefer-public-send'
|
171
|
+
Enabled: true
|
172
|
+
|
173
|
+
Style/StringMethods:
|
174
|
+
Description: 'Checks if configured preferred methods are used over non-preferred.'
|
175
|
+
Enabled: true
|
176
|
+
|
177
|
+
#Style/SymbolArray:
|
178
|
+
# Description: 'Use %i or %I for arrays of symbols.'
|
179
|
+
# StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#percent-i'
|
180
|
+
# Enabled: true
|
181
|
+
|
182
|
+
Lint/LiteralInInterpolation:
|
183
|
+
Description: 'Avoid interpolating literals in strings'
|
184
|
+
AutoCorrect: true
|
data/.rubocop_todo.yml
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# This configuration was generated by
|
2
|
+
# `rubocop --auto-gen-config`
|
3
|
+
# on 2016-01-23 03:18:41 +0100 using RuboCop version 0.36.0.
|
4
|
+
# The point is for the user to remove these configuration records
|
5
|
+
# one by one as the offenses are removed from the code base.
|
6
|
+
# Note that changes in the inspected code, or installation of new
|
7
|
+
# versions of RuboCop, may require this file to be generated again.
|
8
|
+
|
9
|
+
# Offense count: 11
|
10
|
+
Metrics/AbcSize:
|
11
|
+
Max: 137
|
12
|
+
|
13
|
+
# Offense count: 2
|
14
|
+
# Configuration parameters: CountComments.
|
15
|
+
Metrics/ClassLength:
|
16
|
+
Max: 214
|
17
|
+
|
18
|
+
# Offense count: 9
|
19
|
+
Metrics/CyclomaticComplexity:
|
20
|
+
Max: 41
|
21
|
+
|
22
|
+
# Offense count: 8
|
23
|
+
# Configuration parameters: CountComments.
|
24
|
+
Metrics/MethodLength:
|
25
|
+
Max: 57
|
26
|
+
|
27
|
+
# Offense count: 1
|
28
|
+
# Configuration parameters: CountComments.
|
29
|
+
Metrics/ModuleLength:
|
30
|
+
Max: 134
|
31
|
+
|
32
|
+
# Offense count: 6
|
33
|
+
Metrics/PerceivedComplexity:
|
34
|
+
Max: 43
|
35
|
+
|
36
|
+
# Offense count: 4
|
37
|
+
# Cop supports --auto-correct.
|
38
|
+
Style/CommentIndentation:
|
39
|
+
Exclude:
|
40
|
+
- 'lib/pragmatic_tokenizer/tokenizer.rb'
|
41
|
+
|
42
|
+
# Offense count: 31
|
43
|
+
Style/Documentation:
|
44
|
+
Enabled: false
|
45
|
+
|
46
|
+
# Offense count: 2
|
47
|
+
Style/MultilineBlockChain:
|
48
|
+
Exclude:
|
49
|
+
- 'lib/pragmatic_tokenizer/post_processor.rb'
|
50
|
+
|
51
|
+
# Offense count: 1
|
52
|
+
# Configuration parameters: SuspiciousParamNames.
|
53
|
+
# SuspiciousParamNames: options, opts, args, params, parameters
|
54
|
+
Style/OptionHash:
|
55
|
+
Exclude:
|
56
|
+
- 'lib/pragmatic_tokenizer/tokenizer.rb'
|
57
|
+
|
58
|
+
# Offense count: 4
|
59
|
+
# Cop supports --auto-correct.
|
60
|
+
# Configuration parameters: EnforcedStyle, SupportedStyles, AllowInnerSlashes.
|
61
|
+
# SupportedStyles: slashes, percent_r, mixed
|
62
|
+
Style/RegexpLiteral:
|
63
|
+
Exclude:
|
64
|
+
- 'lib/pragmatic_tokenizer/post_processor.rb'
|
65
|
+
- 'lib/pragmatic_tokenizer/pre_processor.rb'
|
66
|
+
- 'lib/pragmatic_tokenizer/tokenizer.rb'
|
data/README.md
CHANGED
@@ -371,15 +371,8 @@ Contractions: No
|
|
371
371
|
## Resources
|
372
372
|
|
373
373
|
* [The Art of Tokenization](https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en)
|
374
|
-
|
375
374
|
* [Handbook Of Natural Language Processing Second Edition](https://karczmarczuk.users.greyc.fr/TEACH/TAL/Doc/Handbook%20Of%20Natural%20Language%20Processing,%20Second%20Edition%20Chapman%20&%20Hall%20Crc%20Machine%20Learning%20&%20Pattern%20Recognition%202010.pdf)
|
376
375
|
|
377
|
-
## Development
|
378
|
-
|
379
|
-
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
|
380
|
-
|
381
|
-
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
382
|
-
|
383
376
|
## Contributing
|
384
377
|
|
385
378
|
1. Fork it ( https://github.com/diasks2/pragmatic_tokenizer/fork )
|
data/Rakefile
CHANGED
@@ -15,7 +15,7 @@ module PragmaticTokenizer
|
|
15
15
|
if split_punctuation[0].nil?
|
16
16
|
cleaned_tokens << a
|
17
17
|
else
|
18
|
-
cleaned_tokens << a.tr(split_punctuation[0],'')
|
18
|
+
cleaned_tokens << a.tr(split_punctuation[0], '')
|
19
19
|
if split_punctuation[0].length.eql?(1)
|
20
20
|
cleaned_tokens << split_punctuation[0]
|
21
21
|
else
|
@@ -28,4 +28,4 @@ module PragmaticTokenizer
|
|
28
28
|
cleaned_tokens
|
29
29
|
end
|
30
30
|
end
|
31
|
-
end
|
31
|
+
end
|
@@ -18,18 +18,18 @@ module PragmaticTokenizer
|
|
18
18
|
cleaned_tokens = []
|
19
19
|
tokens.each_with_index do |_t, i|
|
20
20
|
if tokens[i + 1] && tokens[i] =~ /\A(.+)\.\z/
|
21
|
-
w =
|
22
|
-
unless abbr[Unicode
|
23
|
-
|
24
|
-
cleaned_tokens <<
|
21
|
+
w = Regexp.last_match(1)
|
22
|
+
unless abbr[Unicode.downcase(w)] || w =~ /\A[a-z]\z/i ||
|
23
|
+
w =~ /[a-z](?:\.[a-z])+\z/i
|
24
|
+
cleaned_tokens << w
|
25
25
|
cleaned_tokens << '.'
|
26
26
|
next
|
27
27
|
end
|
28
28
|
end
|
29
29
|
cleaned_tokens << tokens[i]
|
30
30
|
end
|
31
|
-
if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbreviations.include?(Unicode
|
32
|
-
cleaned_tokens[-1] =
|
31
|
+
if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbreviations.include?(Unicode.downcase(cleaned_tokens[-1]).chomp("."))
|
32
|
+
cleaned_tokens[-1] = Regexp.last_match(1)
|
33
33
|
cleaned_tokens.push '.'
|
34
34
|
end
|
35
35
|
cleaned_tokens
|
@@ -4,7 +4,7 @@ module PragmaticTokenizer
|
|
4
4
|
PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^', '”'].freeze
|
5
5
|
PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘", "‘" => "⚭" }.freeze
|
6
6
|
SEMI_PUNCTUATION = ['。', '.', '.'].freeze
|
7
|
-
ROMAN_NUMERALS =
|
7
|
+
ROMAN_NUMERALS = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx xxi xxii xxiii xxiv xxv xxvi xxvii xxviii xxix xxx xxxi xxxii xxxiii xxxiv xxxv xxxvi xxxvii xxxviii xxxix xl xli xlii xliii xliv xlv xlvi xlvii xlviii xlix l li lii liii liv lv lvi lvii lviii lix lx lxi lxii lxiii lxiv lxv lxvi lxvii lxviii lxix lxx lxxi lxxii lxxiii lxxiv lxxv lxxvi lxxvii lxxviii lxxix lxxx lxxxi lxxxii lxxxiii lxxxiv lxxxv lxxxvi lxxxvii lxxxviii lxxxix xc xci xcii xciii xciv xcv xcvi xcvii xcviii xcix).freeze
|
8
8
|
SPECIAL_CHARACTERS = ['®', '©', '™'].freeze
|
9
9
|
ABBREVIATIONS = [].freeze
|
10
10
|
STOP_WORDS = [].freeze
|
@@ -17,11 +17,11 @@ module PragmaticTokenizer
|
|
17
17
|
class SingleQuotes
|
18
18
|
def handle_single_quotes(text)
|
19
19
|
# Convert left quotes to special character except for 'Twas or 'twas
|
20
|
-
text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) {
|
21
|
-
text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) {
|
20
|
+
text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { Regexp.last_match(1) ? Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
|
21
|
+
text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { Regexp.last_match(1) ? Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
|
22
22
|
text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
|
23
23
|
# Separate right single quotes
|
24
|
-
text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) {
|
24
|
+
text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
|
25
25
|
end
|
26
26
|
end
|
27
27
|
end
|
@@ -3,8 +3,8 @@ module PragmaticTokenizer
|
|
3
3
|
module Danish
|
4
4
|
include Languages::Common
|
5
5
|
ABBREVIATIONS = [].freeze
|
6
|
-
STOP_WORDS =
|
6
|
+
STOP_WORDS = %w(De I af aldrig alle altid bagved de der du efter eller en endnu et fjernt for foran fra få gennem god han her hos hovfor hun hurtig hvad hvem hvonår hvor hvordan hvorhen imod ja jeg langsom lidt mange med meget mellem mere mindre måske nede nej nok nu når og oppe på rask sammen temmelig til uden udenfor under ved vi).freeze
|
7
7
|
CONTRACTIONS = {}.freeze
|
8
8
|
end
|
9
9
|
end
|
10
|
-
end
|
10
|
+
end
|
@@ -2,27 +2,98 @@ module PragmaticTokenizer
|
|
2
2
|
module Languages
|
3
3
|
module Deutsch
|
4
4
|
include Languages::Common
|
5
|
-
ABBREVIATIONS = [
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
5
|
+
ABBREVIATIONS = [
|
6
|
+
'a', 'a.d', 'a.k.a', 'a.s.a.p', 'abg', 'alt', 'apr', 'art', 'aug', 'b',
|
7
|
+
'b.a', 'b.s', 'best', 'bgm', 'bldg', 'btw', 'buchst', 'bzgl', 'bzw', 'c',
|
8
|
+
'ca', 'co', 'd', 'd.d', 'd.h', 'd.r', 'dergl', 'dez', 'dgl', 'dr', 'dr ',
|
9
|
+
'dt', 'dzt', 'e', 'e.l', 'e.u', 'e.v', 'ehem', 'eig', 'etc', 'etc.p.p',
|
10
|
+
'eu', 'europ', 'ev', 'ev ', 'evtl', 'f', 'f.d', 'feat', 'feb', 'ff',
|
11
|
+
'fr', 'frz', 'ft', 'g', 'gg', 'ggf', 'ggü', 'griech', 'h', 'h.c', 'h.p',
|
12
|
+
'hon', 'hosp', 'hr', 'i', 'i.a', 'i.d', 'i.d.r', 'i.f', 'i.p', 'i.z',
|
13
|
+
'ii', 'iii', 'inkl', 'int', 'iv', 'ix', 'j', 'jan', 'jul', 'jun', 'k',
|
14
|
+
'k.a', 'k.i.z', 'k.o', 'k.u.k', 'kath ', 'l', 'l.a', 'lfd', 'lt', 'ltd',
|
15
|
+
'm', 'm.e', 'm.w', 'mag', 'max', 'me', 'med', 'mind', 'mio', 'mme', 'mr',
|
16
|
+
'mrd', 'mrs', 'ms', 'mwst', 'mär', 'n', 'nov', 'nr', 'o', 'o.k', 'o.ä',
|
17
|
+
'oct', 'okt', 'omg', 'oö', 'p', 'p.a', 'p.m', 'p.s', 'p.t', 'pol', 'pp',
|
18
|
+
'prof', 'präs', 'q', 'r', 'r.i.p', 'r.r', 'ranz', 'rd', 'rep', 'rt',
|
19
|
+
'russ', 's', 's.g', 'sen', 'sep', 'sog', 'st', 'std', 'str', 't', 'türk',
|
20
|
+
'u', 'u.a', 'u.a ', 'u.a.m', 'u.a.v', 'u.k', 'u.s', 'u.s.w', 'u.u',
|
21
|
+
'u.v.a', 'u.v.m', 'u.ä', 'ungar', 'usf', 'usw', 'uvm', 'v', 'v.a', 'v.d',
|
22
|
+
'v.m', 'vgl', 'vi', 'vii', 'viii', 'vs', 'w', 'wg', 'wr', 'x', 'xi',
|
23
|
+
'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'y',
|
24
|
+
'z', 'z.b', 'z.t', 'z.z', 'z.zt', 'zb', 'zt', 'zw', 'zzt', 'ä', 'ö',
|
25
|
+
'öffentl', 'öst', 'österr', 'ü'].freeze
|
26
|
+
STOP_WORDS = [
|
27
|
+
"a", "ab", "aber", "ach", "acht", "achte", "achten", "achter", "achtes",
|
28
|
+
"ag", "alle", "allein", "allem", "allen", "aller", "allerdings", "alles",
|
29
|
+
"allgemeinen", "als", "also", "am", "an", "andere", "anderen", "andern",
|
30
|
+
"anders", "au", "auch", "auf", "aus", "ausser", "ausserdem", "außer",
|
31
|
+
"außerdem", "b", "bald", "bei", "beide", "beiden", "beim", "beispiel",
|
32
|
+
"bekannt", "bereits", "besonders", "besser", "besten", "bin", "bis",
|
33
|
+
"bisher", "bist", "c", "d", "d.h", "da", "dabei", "dadurch", "dafür",
|
34
|
+
"dagegen", "daher", "dahin", "dahinter", "damals", "damit", "danach",
|
35
|
+
"daneben", "dank", "dann", "daran", "darauf", "daraus", "darf", "darfst",
|
36
|
+
"darin", "darum", "darunter", "darüber", "das", "dasein", "daselbst", "dass",
|
37
|
+
"dasselbe", "davon", "davor", "dazu", "dazwischen", "daß", "dein", "deine",
|
38
|
+
"deinem", "deiner", "dem", "dementsprechend", "demgegenüber", "demgemäss",
|
39
|
+
"demgemäß", "demselben", "demzufolge", "den", "denen", "denn", "denselben",
|
40
|
+
"der", "deren", "derjenige", "derjenigen", "dermassen", "dermaßen", "derselbe",
|
41
|
+
"derselben", "des", "deshalb", "desselben", "dessen", "deswegen", "dich", "die",
|
42
|
+
"diejenige", "diejenigen", "dies", "diese", "dieselbe", "dieselben", "diesem",
|
43
|
+
"diesen", "dieser", "dieses", "dir", "doch", "dort", "drei", "drin", "dritte",
|
44
|
+
"dritten", "dritter", "drittes", "du", "durch", "durchaus", "durfte", "durften",
|
45
|
+
"dürfen", "dürft", "e", "eben", "ebenso", "ehrlich", "ei", "ei,", "eigen",
|
46
|
+
"eigene", "eigenen", "eigener", "eigenes", "ein", "einander", "eine", "einem",
|
47
|
+
"einen", "einer", "eines", "einige", "einigen", "einiger", "einiges", "einmal",
|
48
|
+
"eins", "elf", "en", "ende", "endlich", "entweder", "er", "erst", "erste",
|
49
|
+
"ersten", "erster", "erstes", "es", "etwa", "etwas", "euch", "euer", "eure", "f",
|
50
|
+
"früher", "fünf", "fünfte", "fünften", "fünfter", "fünftes", "für", "g", "gab",
|
51
|
+
"ganz", "ganze", "ganzen", "ganzer", "ganzes", "gar", "gedurft", "gegen",
|
52
|
+
"gegenüber", "gehabt", "gehen", "geht", "gekannt", "gekonnt", "gemacht",
|
53
|
+
"gemocht", "gemusst", "genug", "gerade", "gern", "gesagt", "geschweige",
|
54
|
+
"gewesen", "gewollt", "geworden", "gibt", "ging", "gleich", "gott", "gross",
|
55
|
+
"grosse", "grossen", "grosser", "grosses", "groß", "große", "großen", "großer",
|
56
|
+
"großes", "gut", "gute", "guter", "gutes", "h", "habe", "haben", "habt", "hast",
|
57
|
+
"hat", "hatte", "hatten", "hattest", "hattet", "heisst", "her", "heute", "hier",
|
58
|
+
"hin", "hinter", "hoch", "hätte", "hätten", "i", "ich", "ihm", "ihn", "ihnen",
|
59
|
+
"ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "im", "immer", "in", "indem",
|
60
|
+
"infolgedessen", "ins", "irgend", "ist", "j", "ja", "jahr", "jahre", "jahren",
|
61
|
+
"je", "jede", "jedem", "jeden", "jeder", "jedermann", "jedermanns", "jedes",
|
62
|
+
"jedoch", "jemand", "jemandem", "jemanden", "jene", "jenem", "jenen", "jener",
|
63
|
+
"jenes", "jetzt", "k", "kam", "kann", "kannst", "kaum", "kein", "keine", "keinem",
|
64
|
+
"keinen", "keiner", "kleine", "kleinen", "kleiner", "kleines", "km", "kommen",
|
65
|
+
"kommt", "konnte", "konnten", "kurz", "können", "könnt", "könnte", "l", "lang",
|
66
|
+
"lange", "leicht", "leide", "lieber", "los", "m", "machen", "macht", "machte",
|
67
|
+
"mag", "magst", "mahn", "man", "manche", "manchem", "manchen", "mancher", "manches",
|
68
|
+
"mann", "mehr", "mein", "meine", "meinem", "meinen", "meiner", "meines", "mensch",
|
69
|
+
"menschen", "mich", "mir", "mit", "mittel", "mochte", "mochten", "morgen", "muss",
|
70
|
+
"musst", "musste", "mussten", "muß", "mußt", "möchte", "mögen", "möglich", "mögt",
|
71
|
+
"müssen", "müsst", "müßt", "n", "na", "nach", "nachdem", "nahm", "natürlich",
|
72
|
+
"neben", "nein", "neue", "neuen", "neun", "neunte", "neunten", "neunter", "neuntes",
|
73
|
+
"nicht", "nichts", "nie", "niemand", "niemandem", "niemanden", "noch", "nun", "nur",
|
74
|
+
"o", "ob", "oben", "oder", "offen", "oft", "ohne", "p", "q", "r", "recht", "rechte",
|
75
|
+
"rechten", "rechter", "rechtes", "richtig", "rund", "s", "sa", "sache", "sagt",
|
76
|
+
"sagte", "sah", "satt", "schlecht", "schon", "sechs", "sechste", "sechsten",
|
77
|
+
"sechster", "sechstes", "sehr", "sei", "seid", "seien", "sein", "seine", "seinem",
|
78
|
+
"seinen", "seiner", "seines", "seit", "seitdem", "selbst", "sich", "sie", "sieben",
|
79
|
+
"siebente", "siebenten", "siebenter", "siebentes", "sind", "so", "solang", "solche",
|
80
|
+
"solchem", "solchen", "solcher", "solches", "soll", "sollen", "sollst", "sollt",
|
81
|
+
"sollte", "sollten", "sondern", "sonst", "soweit", "sowie", "später", "statt", "t",
|
82
|
+
"tag", "tage", "tagen", "tat", "teil", "tel", "tritt", "trotzdem", "tun", "u",
|
83
|
+
"uhr", "um", "und", "und?", "uns", "unser", "unsere", "unserer", "unter", "v",
|
84
|
+
"vergangenen", "viel", "viele", "vielem", "vielen", "vielleicht", "vier", "vierte",
|
85
|
+
"vierten", "vierter", "viertes", "vom", "von", "vor", "w", "wahr?", "wann", "war",
|
86
|
+
"waren", "wart", "warum", "was", "wegen", "weil", "weit", "weiter", "weitere",
|
87
|
+
"weiteren", "weiteres", "welche", "welchem", "welchen", "welcher", "welches", "wem",
|
88
|
+
"wen", "wenig", "wenige", "weniger", "weniges", "wenigstens", "wenn", "wer", "werde",
|
89
|
+
"werden", "werdet", "weshalb", "wessen", "wie", "wieder", "wieso", "will", "willst",
|
90
|
+
"wir", "wird", "wirklich", "wirst", "wo", "woher", "wohin", "wohl", "wollen", "wollt",
|
91
|
+
"wollte", "wollten", "worden", "wurde", "wurden", "während", "währenddem",
|
92
|
+
"währenddessen", "wäre", "würde", "würden", "x", "y", "z", "z.b", "zehn", "zehnte",
|
93
|
+
"zehnten", "zehnter", "zehntes", "zeit", "zu", "zuerst", "zugleich", "zum",
|
94
|
+
"zunächst", "zur", "zurück", "zusammen", "zwanzig", "zwar", "zwei", "zweite",
|
95
|
+
"zweiten", "zweiter", "zweites", "zwischen", "zwölf", "über", "überhaupt",
|
96
|
+
"übrigens"].freeze
|
26
97
|
CONTRACTIONS = {
|
27
98
|
"auf's" => "auf das",
|
28
99
|
"can't" => "cannot",
|
@@ -63,7 +134,7 @@ module PragmaticTokenizer
|
|
63
134
|
"wird's" => "wird es",
|
64
135
|
"wär's" => "wäre es",
|
65
136
|
"ö's" => "österreichs"
|
66
|
-
}
|
137
|
+
}.freeze
|
67
138
|
end
|
68
139
|
end
|
69
|
-
end
|
140
|
+
end
|
@@ -3,8 +3,8 @@ module PragmaticTokenizer
|
|
3
3
|
module Dutch
|
4
4
|
include Languages::Common
|
5
5
|
ABBREVIATIONS = [].freeze
|
6
|
-
STOP_WORDS =
|
6
|
+
STOP_WORDS = %w(aan af al als bij dan dat die dit een en er had heb hem het hij hoe hun ik in is je kan me men met mij nog nu of ons ook te tot uit van was wat we wel wij zal ze zei zij zo zou aan aangaande aangezien achter achterna afgelopen aldaar aldus alhoewel alias alle allebei alleen alsnog altijd altoos ander andere anders anderszins behalve behoudens beide beiden ben beneden bent bepaald betreffende binnen binnenin boven bovenal bovendien bovengenoemd bovenstaand bovenvermeld buiten daar daarheen daarin daarna daarnet daarom daarop daarvanlangs de dikwijls door doorgaand dus echter eer eerdat eerder eerlang eerst elk elke enig enigszins enkel erdoor even eveneens evenwel gauw gedurende geen gehad gekund geleden gelijk gemoeten gemogen geweest gewoon gewoonweg haar hadden hare hebben hebt heeft hen hierbeneden hierboven hoewel hunne ikzelf inmiddels inzake jezelf jij jijzelf jou jouw jouwe juist jullie klaar kon konden krachtens kunnen kunt later liever maar mag meer mezelf mijn mijnent mijner mijzelf misschien mocht mochten moest moesten moet moeten mogen na naar nadat net niet noch nogal ofschoon om omdat omhoog omlaag omstreeks omtrent omver onder ondertussen ongeveer onszelf onze op opnieuw opzij over overeind overigens pas precies reeds rond rondom sedert sinds sindsdien slechts sommige spoedig steeds tamelijk tenzij terwijl thans tijdens toch toen toenmaals toenmalig totdat tussen uitgezonderd vaakwat vandaan vanuit vanwege veeleer verder vervolgens vol volgens voor vooraf vooral vooralsnog voorbij voordat voordezen voordien voorheen voorop vooruit vrij vroeg waar waarom wanneer want waren weer weg wegens weldra welk welke wie wiens wier wijzelf zelfs zichzelf zijn zijne zodra zonder zouden zowat zulke zullen zult worden wordt deze).freeze
|
7
7
|
CONTRACTIONS = {}.freeze
|
8
8
|
end
|
9
9
|
end
|
10
|
-
end
|
10
|
+
end
|