pragmatic_tokenizer 3.1.0 → 3.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/pragmatic_tokenizer/pre_processor.rb +1 -1
- data/lib/pragmatic_tokenizer/regex.rb +1 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +1 -1
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/lib/unicode.rb +9 -0
- data/pragmatic_tokenizer.gemspec +0 -1
- data/spec/languages/english_spec.rb +13 -0
- metadata +7 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e65a28fbe1b06c5aaed25f8fc5bce8b46b2a8dbcaeddee07139ae733fe9ed51a
|
4
|
+
data.tar.gz: 767847d1d88d74c645763fcc569060e4e3c5beeb8e41364f437562eaeb796f08
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1a88cf9354785dbd50890a088f0251eac8ea817c915b46afdf9bc82a7d47a1f60522761b23144861ffb5ebb4bf7a516a4e00a60a4e0d3af621f7618bb5c95ad7
|
7
|
+
data.tar.gz: 8afec02448e6552d5d84b9d296f69185d864c1617eb5643304a5e74132aa1b6d0262d3e2ca1f88bc682af2bb39320618ae7ae8a320b0617ac1b4883d6801909f
|
@@ -253,7 +253,7 @@ module PragmaticTokenizer
|
|
253
253
|
return token if token.length <= @long_word_split
|
254
254
|
return token if token =~ Regex::ONLY_HASHTAG_MENTION
|
255
255
|
return token if token =~ Regex::DOMAIN_OR_EMAIL
|
256
|
-
token.split(Regex::
|
256
|
+
token.split(Regex::LONG_WORD_SPLIT)
|
257
257
|
end
|
258
258
|
|
259
259
|
def chosen_case(text)
|
data/lib/unicode.rb
ADDED
data/pragmatic_tokenizer.gemspec
CHANGED
@@ -17,7 +17,6 @@ Gem::Specification.new do |spec|
|
|
17
17
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
18
|
spec.require_paths = ["lib"]
|
19
19
|
|
20
|
-
spec.add_runtime_dependency "unicode"
|
21
20
|
spec.add_development_dependency "bundler", "~> 1.9"
|
22
21
|
spec.add_development_dependency "rake", ">= 12.3.3"
|
23
22
|
spec.add_development_dependency "rspec"
|
@@ -88,6 +88,12 @@ describe PragmaticTokenizer do
|
|
88
88
|
expect(pt.tokenize(text)).to eq(["#ab-cd"])
|
89
89
|
end
|
90
90
|
|
91
|
+
it 'tokenizes a string #015' do
|
92
|
+
text = "In 2004, he co-founded Palantir Technologies, which offers platforms for finance companies and intelligence, defense, and law enforcement communities to integrate, visualize, and analyze the world's information."
|
93
|
+
pt = PragmaticTokenizer::Tokenizer.new
|
94
|
+
expect(pt.tokenize(text)).to eq(["in", "2004", ",", "he", "co-founded", "palantir", "technologies", ",", "which", "offers", "platforms", "for", "finance", "companies", "and", "intelligence", ",", "defense", ",", "and", "law", "enforcement", "communities", "to", "integrate", ",", "visualize", ",", "and", "analyze", "the", "world's", "information", "."])
|
95
|
+
end
|
96
|
+
|
91
97
|
it 'handles numbers with symbols 2' do
|
92
98
|
text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
|
93
99
|
pt = PragmaticTokenizer::Tokenizer.new
|
@@ -543,6 +549,13 @@ describe PragmaticTokenizer do
|
|
543
549
|
)
|
544
550
|
expect(pt.tokenize(text)).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
|
545
551
|
end
|
552
|
+
it 'tokenizes something with a slash' do
|
553
|
+
text = "EO/AA/M/F/Veterans/Disability/Sexual Orientation/Gender Identity"
|
554
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
555
|
+
long_word_split: 1
|
556
|
+
)
|
557
|
+
expect(pt.tokenize(text)).to eq(["eo", "aa", "m", "f", "veterans", "disability", "sexual", "orientation", "gender", "identity"])
|
558
|
+
end
|
546
559
|
end
|
547
560
|
|
548
561
|
context 'option (clean)' do
|
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.0
|
4
|
+
version: 3.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-08-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: unicode
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ">="
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ">="
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: bundler
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -142,6 +128,7 @@ files:
|
|
142
128
|
- lib/pragmatic_tokenizer/regex.rb
|
143
129
|
- lib/pragmatic_tokenizer/tokenizer.rb
|
144
130
|
- lib/pragmatic_tokenizer/version.rb
|
131
|
+
- lib/unicode.rb
|
145
132
|
- pragmatic_tokenizer.gemspec
|
146
133
|
- spec/languages/bulgarian_spec.rb
|
147
134
|
- spec/languages/deutsch_spec.rb
|
@@ -153,7 +140,7 @@ files:
|
|
153
140
|
homepage: https://github.com/diasks2/pragmatic_tokenizer
|
154
141
|
licenses: []
|
155
142
|
metadata: {}
|
156
|
-
post_install_message:
|
143
|
+
post_install_message:
|
157
144
|
rdoc_options: []
|
158
145
|
require_paths:
|
159
146
|
- lib
|
@@ -168,9 +155,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
168
155
|
- !ruby/object:Gem::Version
|
169
156
|
version: '0'
|
170
157
|
requirements: []
|
171
|
-
|
172
|
-
|
173
|
-
signing_key:
|
158
|
+
rubygems_version: 3.3.26
|
159
|
+
signing_key:
|
174
160
|
specification_version: 4
|
175
161
|
summary: A multilingual tokenizer
|
176
162
|
test_files:
|