pragmatic_tokenizer 3.0.4 → 3.2.0

This diff shows the changes between package versions as published to their respective public registries. It is provided for informational purposes only.
Files changed (36)
  1. checksums.yaml +5 -5
  2. data/lib/pragmatic_tokenizer/languages.rb +26 -26
  3. data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
  4. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
  5. data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
  6. data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
  7. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  8. data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
  9. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  10. data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
  11. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  12. data/lib/pragmatic_tokenizer/languages/french.rb +36 -9
  13. data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  15. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  16. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  17. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  19. data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
  20. data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
  22. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  23. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  25. data/lib/pragmatic_tokenizer/post_processor.rb +42 -88
  26. data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
  27. data/lib/pragmatic_tokenizer/regex.rb +150 -0
  28. data/lib/pragmatic_tokenizer/tokenizer.rb +81 -115
  29. data/lib/pragmatic_tokenizer/version.rb +1 -1
  30. data/pragmatic_tokenizer.gemspec +5 -6
  31. data/spec/languages/english_spec.rb +13 -0
  32. data/spec/languages/french_spec.rb +2 -2
  33. data/spec/performance_spec.rb +0 -1
  34. data/spec/spec_helper.rb +1 -1
  35. metadata +12 -12
  36. data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -62
data/lib/pragmatic_tokenizer/version.rb CHANGED
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "3.0.4".freeze
+  VERSION = "3.2.0".freeze
 end
data/pragmatic_tokenizer.gemspec CHANGED
@@ -1,5 +1,4 @@
-# coding: utf-8
-lib = File.expand_path('../lib', __FILE__)
+lib = File.expand_path('lib', __dir__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'pragmatic_tokenizer/version'
 
@@ -9,9 +8,9 @@ Gem::Specification.new do |spec|
   spec.authors = ["Kevin S. Dias"]
   spec.email = ["diasks2@gmail.com"]
 
-  spec.summary = %q{A multilingual tokenizer}
-  spec.description = %q{A multilingual tokenizer to split a string into tokens.}
-  spec.homepage = "https://github.com/diasks2/pragmatic_tokenizer"
+  spec.summary = 'A multilingual tokenizer'
+  spec.description = 'A multilingual tokenizer to split a string into tokens.'
+  spec.homepage = 'https://github.com/diasks2/pragmatic_tokenizer'
 
   spec.files = `git ls-files -z`.split("\x0")
   spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
@@ -20,7 +19,7 @@ Gem::Specification.new do |spec|
 
   spec.add_runtime_dependency "unicode"
   spec.add_development_dependency "bundler", "~> 1.9"
-  spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_development_dependency "rake", ">= 12.3.3"
   spec.add_development_dependency "rspec"
   spec.add_development_dependency "stackprof"
   spec.add_development_dependency "rubocop"
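
A side note on the `__dir__` edits (here and in spec/spec_helper.rb further down): the old and new expressions resolve the same directory; `__dir__` is simply the post-Ruby-2.0 idiom that avoids the '../' adjustment against `__FILE__`. An illustrative one-liner, not part of the diff:

    # Evaluated inside pragmatic_tokenizer.gemspec, both forms build the
    # gem's lib/ path (modulo symlink resolution, since __dir__ canonicalizes):
    File.expand_path('../lib', __FILE__) == File.expand_path('lib', __dir__)  # => true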
data/spec/languages/english_spec.rb CHANGED
@@ -88,6 +88,12 @@ describe PragmaticTokenizer do
       expect(pt.tokenize(text)).to eq(["#ab-cd"])
     end
 
+    it 'tokenizes a string #015' do
+      text = "In 2004, he co-founded Palantir Technologies, which offers platforms for finance companies and intelligence, defense, and law enforcement communities to integrate, visualize, and analyze the world's information."
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["in", "2004", ",", "he", "co-founded", "palantir", "technologies", ",", "which", "offers", "platforms", "for", "finance", "companies", "and", "intelligence", ",", "defense", ",", "and", "law", "enforcement", "communities", "to", "integrate", ",", "visualize", ",", "and", "analyze", "the", "world's", "information", "."])
+    end
+
     it 'handles numbers with symbols 2' do
       text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
       pt = PragmaticTokenizer::Tokenizer.new
@@ -543,6 +549,13 @@ describe PragmaticTokenizer do
       )
       expect(pt.tokenize(text)).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
     end
+    it 'tokenizes something with a slash' do
+      text = "EO/AA/M/F/Veterans/Disability/Sexual Orientation/Gender Identity"
+      pt = PragmaticTokenizer::Tokenizer.new(
+          long_word_split: 1
+      )
+      expect(pt.tokenize(text)).to eq(["eo", "aa", "m", "f", "veterans", "disability", "sexual", "orientation", "gender", "identity"])
+    end
   end
 
   context 'option (clean)' do
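
Taken together, the two added specs pin down the default pipeline (downcasing, punctuation split into separate tokens, hyphenated compounds such as "co-founded" kept whole) and the `long_word_split` option's handling of slash-joined strings. A minimal standalone sketch of the same behavior; outputs are inferred from the spec expectations above rather than re-run:

    require 'pragmatic_tokenizer'

    # Default options, per spec #015: lowercased tokens, commas and the
    # final period split off, "co-founded" preserved as one token.
    PragmaticTokenizer::Tokenizer.new.tokenize("He co-founded Palantir Technologies.")
    # => ["he", "co-founded", "palantir", "technologies", "."]

    # long_word_split: 1, per the slash spec: tokens are additionally
    # broken apart at internal slashes.
    pt = PragmaticTokenizer::Tokenizer.new(long_word_split: 1)
    pt.tokenize("EO/AA/M/F")
    # => ["eo", "aa", "m", "f"]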
data/spec/languages/french_spec.rb CHANGED
@@ -3,11 +3,11 @@ require 'spec_helper'
 describe PragmaticTokenizer do
   context 'Language: French (fr)' do
     it 'tokenizes a string #001' do
-      text = "L'art de l'univers, c'est un art"
+      text = "D'art de l'univers, c'est un art"
       pt = PragmaticTokenizer::Tokenizer.new(
           language: 'fr'
       )
-      expect(pt.tokenize(text)).to eq(["l'", "art", "de", "l'", "univers", ",", "c'est", "un", "art"])
+      expect(pt.tokenize(text)).to eq(["d'", "art", "de", "l'", "univers", ",", "c'" ,"est", "un", "art"])
     end
   end
 end
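
The substantive change in this spec: 3.0.4 expected the contraction "c'est" to survive as a single token, while 3.2.0 splits French elisions into the clitic plus the following word (the input's first word also changed from "L'art" to "D'art"). A before/after sketch, with outputs taken directly from the two expectations above:

    require 'pragmatic_tokenizer'

    pt = PragmaticTokenizer::Tokenizer.new(language: 'fr')
    pt.tokenize("c'est un art")
    # 3.0.4 expectation: ["c'est", "un", "art"]
    # 3.2.0 expectation: ["c'", "est", "un", "art"]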
data/spec/performance_spec.rb CHANGED
@@ -1,4 +1,3 @@
-# -*- encoding : utf-8 -*-
 require 'benchmark'
 require 'spec_helper'
 require 'stackprof'
data/spec/spec_helper.rb CHANGED
@@ -1,2 +1,2 @@
-$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
+$LOAD_PATH.unshift File.expand_path('../lib', __dir__)
 require 'pragmatic_tokenizer'
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0.4
+  version: 3.2.0
 platform: ruby
 authors:
 - Kevin S. Dias
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-03-02 00:00:00.000000000 Z
+date: 2020-11-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -42,16 +42,16 @@ dependencies:
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '10.0'
+        version: 12.3.3
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '10.0'
+        version: 12.3.3
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -111,7 +111,6 @@ files:
 - README.md
 - Rakefile
 - lib/pragmatic_tokenizer.rb
-- lib/pragmatic_tokenizer/full_stop_separator.rb
 - lib/pragmatic_tokenizer/languages.rb
 - lib/pragmatic_tokenizer/languages/arabic.rb
 - lib/pragmatic_tokenizer/languages/bulgarian.rb
@@ -140,6 +139,7 @@ files:
 - lib/pragmatic_tokenizer/languages/turkish.rb
 - lib/pragmatic_tokenizer/post_processor.rb
 - lib/pragmatic_tokenizer/pre_processor.rb
+- lib/pragmatic_tokenizer/regex.rb
 - lib/pragmatic_tokenizer/tokenizer.rb
 - lib/pragmatic_tokenizer/version.rb
 - pragmatic_tokenizer.gemspec
@@ -153,7 +153,7 @@ files:
 homepage: https://github.com/diasks2/pragmatic_tokenizer
 licenses: []
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -168,9 +168,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.4.1
-signing_key:
+rubyforge_project:
+rubygems_version: 2.7.6
+signing_key:
 specification_version: 4
 summary: A multilingual tokenizer
 test_files:
data/lib/pragmatic_tokenizer/full_stop_separator.rb DELETED
@@ -1,62 +0,0 @@
-# -*- encoding : utf-8 -*-
-
-module PragmaticTokenizer
-  # This class separates true full stops while ignoring
-  # periods that are part of an abbreviation
-  class FullStopSeparator
-
-    REGEXP_ENDS_WITH_DOT = /\A(.*\w)\.\z/
-    REGEXP_ONLY_LETTERS  = /\A[a-z]\z/i
-    REGEXP_ABBREVIATION  = /[a-z](?:\.[a-z])+\z/i
-    DOT                  = '.'.freeze
-
-    def initialize(tokens:, abbreviations:, downcase:)
-      @tokens        = tokens
-      @abbreviations = abbreviations
-      @downcase      = downcase
-    end
-
-    def separate
-      create_cleaned_tokens
-      replace_last_token unless @cleaned_tokens.empty?
-      @cleaned_tokens
-    end
-
-    private
-
-      def create_cleaned_tokens
-        @cleaned_tokens = []
-        @tokens.each_with_index do |token, position|
-          if @tokens[position + 1] && token =~ REGEXP_ENDS_WITH_DOT
-            match = Regexp.last_match(1)
-            if abbreviation?(match)
-              @cleaned_tokens += [match, DOT]
-              next
-            end
-          end
-          @cleaned_tokens << token
-        end
-      end
-
-      def abbreviation?(token)
-        !defined_abbreviation?(token) && token !~ REGEXP_ONLY_LETTERS && token !~ REGEXP_ABBREVIATION
-      end
-
-      def defined_abbreviation?(token)
-        @abbreviations.include?(inverse_case(token))
-      end
-
-      def inverse_case(token)
-        @downcase ? token : Unicode.downcase(token)
-      end
-
-      def replace_last_token
-        last_token = @cleaned_tokens[-1]
-        return if defined_abbreviation?(last_token.chomp(DOT)) || last_token !~ REGEXP_ENDS_WITH_DOT
-        @cleaned_tokens[-1] = Regexp.last_match(1)
-        @cleaned_tokens << DOT
-      end
-
-  end
-
-end
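
FullStopSeparator is deleted outright in 3.2.0; judging by the file list above, its job of splitting sentence-final periods presumably moved into the rewritten post_processor.rb and the new regex.rb, though those diffs are not shown here. For reference, a minimal sketch of how the removed class was driven, using only the API visible in the deleted source; the sample tokens and abbreviation set are illustrative:

    # Against pragmatic_tokenizer 3.0.4, where the class still exists.
    require 'pragmatic_tokenizer'
    require 'set'

    separator = PragmaticTokenizer::FullStopSeparator.new(
      tokens:        ["The", "U.S.A.", "is", "big."],
      abbreviations: Set.new(["u.s.a"]),
      downcase:      false
    )
    separator.separate
    # => ["The", "U.S.A.", "is", "big", "."]
    # "U.S.A." stays intact (a defined abbreviation, and it matches
    # REGEXP_ABBREVIATION anyway); the true full stop on "big." becomes
    # its own token via replace_last_token.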