pragmatic_tokenizer 3.1.0 → 3.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0feac0e3ed363c652e85fda4dd631b6f221da44af13ef9da24e9f0be5a0cdd38
4
- data.tar.gz: adc17ceadcf5749c1c9d1abce01f28658511a9bd251e25dda2ccf333f32cd355
3
+ metadata.gz: e65a28fbe1b06c5aaed25f8fc5bce8b46b2a8dbcaeddee07139ae733fe9ed51a
4
+ data.tar.gz: 767847d1d88d74c645763fcc569060e4e3c5beeb8e41364f437562eaeb796f08
5
5
  SHA512:
6
- metadata.gz: 99bbf93e63cf667703c2c386e51592873f9c70fa20df823d15ef69cf3714fef98c48d2428d451ce9c1433c3b1d23c5a317a2ecf1fc5ed6f9be04a66c8d268773
7
- data.tar.gz: ddb68cee3ea6cf0b4d2bbe581c99a37d1ecb34de5a2c0703073cc53b27d6520dbf6d9bdd811f0bfcb244120c91290040f269eefeb002c3440d8c9fa55a2d9671
6
+ metadata.gz: 1a88cf9354785dbd50890a088f0251eac8ea817c915b46afdf9bc82a7d47a1f60522761b23144861ffb5ebb4bf7a516a4e00a60a4e0d3af621f7618bb5c95ad7
7
+ data.tar.gz: 8afec02448e6552d5d84b9d296f69185d864c1617eb5643304a5e74132aa1b6d0262d3e2ca1f88bc682af2bb39320618ae7ae8a320b0617ac1b4883d6801909f
@@ -17,7 +17,7 @@ module PragmaticTokenizer
17
17
  private
18
18
 
19
19
  def remove_non_breaking_space!
20
- gsub!(Regex::NO_BREAK_SPACE, ''.freeze)
20
+ gsub!(Regex::NO_BREAK_SPACE, ' '.freeze)
21
21
  end
22
22
 
23
23
  def shift_various_characters!
@@ -27,6 +27,7 @@ module PragmaticTokenizer
27
27
  ASTERISK = /(?:\*+)/
28
28
  UNDERSCORE = /(?:_+)/
29
29
  HYPHEN_OR_UNDERSCORE = /(?:[-_])/
30
+ LONG_WORD_SPLIT = /(?:[-_\/—–])/
30
31
  PERIOD_AND_PRIOR = /(?:(.+\.))/
31
32
  PERIOD_ONLY = /(?:(\.))/
32
33
  CONTRACTIONS = /(?:[‘’‚‛‹›'´`])/
@@ -253,7 +253,7 @@ module PragmaticTokenizer
253
253
  return token if token.length <= @long_word_split
254
254
  return token if token =~ Regex::ONLY_HASHTAG_MENTION
255
255
  return token if token =~ Regex::DOMAIN_OR_EMAIL
256
- token.split(Regex::HYPHEN_OR_UNDERSCORE)
256
+ token.split(Regex::LONG_WORD_SPLIT)
257
257
  end
258
258
 
259
259
  def chosen_case(text)
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "3.1.0".freeze
2
+ VERSION = "3.2.1".freeze
3
3
  end
data/lib/unicode.rb ADDED
@@ -0,0 +1,9 @@
1
+ module Unicode
2
+ def self.downcase(text)
3
+ text.downcase
4
+ end
5
+
6
+ def self.capitalize(text)
7
+ text.capitalize
8
+ end
9
+ end
@@ -17,7 +17,6 @@ Gem::Specification.new do |spec|
17
17
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
18
  spec.require_paths = ["lib"]
19
19
 
20
- spec.add_runtime_dependency "unicode"
21
20
  spec.add_development_dependency "bundler", "~> 1.9"
22
21
  spec.add_development_dependency "rake", ">= 12.3.3"
23
22
  spec.add_development_dependency "rspec"
@@ -88,6 +88,12 @@ describe PragmaticTokenizer do
88
88
  expect(pt.tokenize(text)).to eq(["#ab-cd"])
89
89
  end
90
90
 
91
+ it 'tokenizes a string #015' do
92
+ text = "In 2004, he co-founded Palantir Technologies, which offers platforms for finance companies and intelligence, defense, and law enforcement communities to integrate, visualize, and analyze the world's information."
93
+ pt = PragmaticTokenizer::Tokenizer.new
94
+ expect(pt.tokenize(text)).to eq(["in", "2004", ",", "he", "co-founded", "palantir", "technologies", ",", "which", "offers", "platforms", "for", "finance", "companies", "and", "intelligence", ",", "defense", ",", "and", "law", "enforcement", "communities", "to", "integrate", ",", "visualize", ",", "and", "analyze", "the", "world's", "information", "."])
95
+ end
96
+
91
97
  it 'handles numbers with symbols 2' do
92
98
  text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
93
99
  pt = PragmaticTokenizer::Tokenizer.new
@@ -543,6 +549,13 @@ describe PragmaticTokenizer do
543
549
  )
544
550
  expect(pt.tokenize(text)).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
545
551
  end
552
+ it 'tokenizes something with a slash' do
553
+ text = "EO/AA/M/F/Veterans/Disability/Sexual Orientation/Gender Identity"
554
+ pt = PragmaticTokenizer::Tokenizer.new(
555
+ long_word_split: 1
556
+ )
557
+ expect(pt.tokenize(text)).to eq(["eo", "aa", "m", "f", "veterans", "disability", "sexual", "orientation", "gender", "identity"])
558
+ end
546
559
  end
547
560
 
548
561
  context 'option (clean)' do
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.1.0
4
+ version: 3.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-10 00:00:00.000000000 Z
11
+ date: 2024-08-11 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: unicode
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ">="
18
- - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: bundler
29
15
  requirement: !ruby/object:Gem::Requirement
@@ -142,6 +128,7 @@ files:
142
128
  - lib/pragmatic_tokenizer/regex.rb
143
129
  - lib/pragmatic_tokenizer/tokenizer.rb
144
130
  - lib/pragmatic_tokenizer/version.rb
131
+ - lib/unicode.rb
145
132
  - pragmatic_tokenizer.gemspec
146
133
  - spec/languages/bulgarian_spec.rb
147
134
  - spec/languages/deutsch_spec.rb
@@ -153,7 +140,7 @@ files:
153
140
  homepage: https://github.com/diasks2/pragmatic_tokenizer
154
141
  licenses: []
155
142
  metadata: {}
156
- post_install_message:
143
+ post_install_message:
157
144
  rdoc_options: []
158
145
  require_paths:
159
146
  - lib
@@ -168,9 +155,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
168
155
  - !ruby/object:Gem::Version
169
156
  version: '0'
170
157
  requirements: []
171
- rubyforge_project:
172
- rubygems_version: 2.7.6
173
- signing_key:
158
+ rubygems_version: 3.3.26
159
+ signing_key:
174
160
  specification_version: 4
175
161
  summary: A multilingual tokenizer
176
162
  test_files: