pragmatic_tokenizer 3.0.4 → 3.2.0

Files changed (36)
  1. checksums.yaml +5 -5
  2. data/lib/pragmatic_tokenizer/languages.rb +26 -26
  3. data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
  4. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
  5. data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
  6. data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
  7. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  8. data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
  9. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  10. data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
  11. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  12. data/lib/pragmatic_tokenizer/languages/french.rb +36 -9
  13. data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  15. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  16. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  17. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  19. data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
  20. data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
  22. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  23. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  25. data/lib/pragmatic_tokenizer/post_processor.rb +42 -88
  26. data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
  27. data/lib/pragmatic_tokenizer/regex.rb +150 -0
  28. data/lib/pragmatic_tokenizer/tokenizer.rb +81 -115
  29. data/lib/pragmatic_tokenizer/version.rb +1 -1
  30. data/pragmatic_tokenizer.gemspec +5 -6
  31. data/spec/languages/english_spec.rb +13 -0
  32. data/spec/languages/french_spec.rb +2 -2
  33. data/spec/performance_spec.rb +0 -1
  34. data/spec/spec_helper.rb +1 -1
  35. metadata +12 -12
  36. data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -62
data/lib/pragmatic_tokenizer/version.rb CHANGED
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "3.0.4".freeze
+  VERSION = "3.2.0".freeze
 end
data/pragmatic_tokenizer.gemspec CHANGED
@@ -1,5 +1,4 @@
-# coding: utf-8
-lib = File.expand_path('../lib', __FILE__)
+lib = File.expand_path('lib', __dir__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'pragmatic_tokenizer/version'
 
@@ -9,9 +8,9 @@ Gem::Specification.new do |spec|
   spec.authors = ["Kevin S. Dias"]
   spec.email = ["diasks2@gmail.com"]
 
-  spec.summary = %q{A multilingual tokenizer}
-  spec.description = %q{A multilingual tokenizer to split a string into tokens.}
-  spec.homepage = "https://github.com/diasks2/pragmatic_tokenizer"
+  spec.summary = 'A multilingual tokenizer'
+  spec.description = 'A multilingual tokenizer to split a string into tokens.'
+  spec.homepage = 'https://github.com/diasks2/pragmatic_tokenizer'
 
   spec.files = `git ls-files -z`.split("\x0")
   spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
@@ -20,7 +19,7 @@ Gem::Specification.new do |spec|
 
   spec.add_runtime_dependency "unicode"
   spec.add_development_dependency "bundler", "~> 1.9"
-  spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_development_dependency "rake", ">= 12.3.3"
   spec.add_development_dependency "rspec"
   spec.add_development_dependency "stackprof"
   spec.add_development_dependency "rubocop"
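Two mechanical cleanups here: the %q{} literals become plain single-quoted strings, and the file-relative File.expand_path('../lib', __FILE__) idiom becomes the directory-relative __dir__ form. Both idioms resolve to the same path; a standalone sketch, with a hypothetical repository path:

# Evaluated inside /home/user/pragmatic_tokenizer/pragmatic_tokenizer.gemspec:
File.expand_path('../lib', __FILE__) # base is the file path; '..' strips the filename
File.expand_path('lib', __dir__)     # base is already the directory
# => "/home/user/pragmatic_tokenizer/lib" in both cases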
data/spec/languages/english_spec.rb CHANGED
@@ -88,6 +88,12 @@ describe PragmaticTokenizer do
       expect(pt.tokenize(text)).to eq(["#ab-cd"])
     end
 
+    it 'tokenizes a string #015' do
+      text = "In 2004, he co-founded Palantir Technologies, which offers platforms for finance companies and intelligence, defense, and law enforcement communities to integrate, visualize, and analyze the world's information."
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["in", "2004", ",", "he", "co-founded", "palantir", "technologies", ",", "which", "offers", "platforms", "for", "finance", "companies", "and", "intelligence", ",", "defense", ",", "and", "law", "enforcement", "communities", "to", "integrate", ",", "visualize", ",", "and", "analyze", "the", "world's", "information", "."])
+    end
+
     it 'handles numbers with symbols 2' do
       text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
       pt = PragmaticTokenizer::Tokenizer.new
@@ -543,6 +549,13 @@ describe PragmaticTokenizer do
       )
       expect(pt.tokenize(text)).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
     end
+    it 'tokenizes something with a slash' do
+      text = "EO/AA/M/F/Veterans/Disability/Sexual Orientation/Gender Identity"
+      pt = PragmaticTokenizer::Tokenizer.new(
+        long_word_split: 1
+      )
+      expect(pt.tokenize(text)).to eq(["eo", "aa", "m", "f", "veterans", "disability", "sexual", "orientation", "gender", "identity"])
+    end
   end
 
   context 'option (clean)' do
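Both new specs exercise the public API and can be reproduced outside RSpec. A minimal sketch of the second one, with the option name and expected tokens copied from the spec above:

require 'pragmatic_tokenizer'

# long_word_split: 1 splits slash- and hyphen-joined compounds into
# their parts; the default pipeline also downcases each token.
pt = PragmaticTokenizer::Tokenizer.new(long_word_split: 1)
pt.tokenize("EO/AA/M/F/Veterans/Disability/Sexual Orientation/Gender Identity")
# => ["eo", "aa", "m", "f", "veterans", "disability", "sexual", "orientation", "gender", "identity"]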
data/spec/languages/french_spec.rb CHANGED
@@ -3,11 +3,11 @@ require 'spec_helper'
 describe PragmaticTokenizer do
   context 'Language: French (fr)' do
     it 'tokenizes a string #001' do
-      text = "L'art de l'univers, c'est un art"
+      text = "D'art de l'univers, c'est un art"
       pt = PragmaticTokenizer::Tokenizer.new(
         language: 'fr'
       )
-      expect(pt.tokenize(text)).to eq(["l'", "art", "de", "l'", "univers", ",", "c'est", "un", "art"])
+      expect(pt.tokenize(text)).to eq(["d'", "art", "de", "l'", "univers", ",", "c'", "est", "un", "art"])
     end
   end
 end
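The updated expectation shows the French rules now splitting the elision c'est into "c'" and "est", consistent with how d' and l' are handled. To reproduce (tokens copied from the spec above):

require 'pragmatic_tokenizer'

# language: 'fr' loads the French rules, which detach elided articles
# and pronouns (d', l', c') as their own tokens.
pt = PragmaticTokenizer::Tokenizer.new(language: 'fr')
pt.tokenize("D'art de l'univers, c'est un art")
# => ["d'", "art", "de", "l'", "univers", ",", "c'", "est", "un", "art"]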
data/spec/performance_spec.rb CHANGED
@@ -1,4 +1,3 @@
-# -*- encoding : utf-8 -*-
 require 'benchmark'
 require 'spec_helper'
 require 'stackprof'
data/spec/spec_helper.rb CHANGED
@@ -1,2 +1,2 @@
-$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
+$LOAD_PATH.unshift File.expand_path('../lib', __dir__)
 require 'pragmatic_tokenizer'
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0.4
+  version: 3.2.0
 platform: ruby
 authors:
 - Kevin S. Dias
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-03-02 00:00:00.000000000 Z
+date: 2020-11-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -42,16 +42,16 @@ dependencies:
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '10.0'
+        version: 12.3.3
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '10.0'
+        version: 12.3.3
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -111,7 +111,6 @@ files:
 - README.md
 - Rakefile
 - lib/pragmatic_tokenizer.rb
-- lib/pragmatic_tokenizer/full_stop_separator.rb
 - lib/pragmatic_tokenizer/languages.rb
 - lib/pragmatic_tokenizer/languages/arabic.rb
 - lib/pragmatic_tokenizer/languages/bulgarian.rb
@@ -140,6 +139,7 @@ files:
 - lib/pragmatic_tokenizer/languages/turkish.rb
 - lib/pragmatic_tokenizer/post_processor.rb
 - lib/pragmatic_tokenizer/pre_processor.rb
+- lib/pragmatic_tokenizer/regex.rb
 - lib/pragmatic_tokenizer/tokenizer.rb
 - lib/pragmatic_tokenizer/version.rb
 - pragmatic_tokenizer.gemspec
@@ -153,7 +153,7 @@ files:
 homepage: https://github.com/diasks2/pragmatic_tokenizer
 licenses: []
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -168,9 +168,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.4.1
-signing_key:
+rubyforge_project:
+rubygems_version: 2.7.6
+signing_key:
 specification_version: 4
 summary: A multilingual tokenizer
 test_files:
data/lib/pragmatic_tokenizer/full_stop_separator.rb DELETED
@@ -1,62 +0,0 @@
-# -*- encoding : utf-8 -*-
-
-module PragmaticTokenizer
-  # This class separates true full stops while ignoring
-  # periods that are part of an abbreviation
-  class FullStopSeparator
-
-    REGEXP_ENDS_WITH_DOT = /\A(.*\w)\.\z/
-    REGEXP_ONLY_LETTERS = /\A[a-z]\z/i
-    REGEXP_ABBREVIATION = /[a-z](?:\.[a-z])+\z/i
-    DOT = '.'.freeze
-
-    def initialize(tokens:, abbreviations:, downcase:)
-      @tokens = tokens
-      @abbreviations = abbreviations
-      @downcase = downcase
-    end
-
-    def separate
-      create_cleaned_tokens
-      replace_last_token unless @cleaned_tokens.empty?
-      @cleaned_tokens
-    end
-
-    private
-
-    def create_cleaned_tokens
-      @cleaned_tokens = []
-      @tokens.each_with_index do |token, position|
-        if @tokens[position + 1] && token =~ REGEXP_ENDS_WITH_DOT
-          match = Regexp.last_match(1)
-          if abbreviation?(match)
-            @cleaned_tokens += [match, DOT]
-            next
-          end
-        end
-        @cleaned_tokens << token
-      end
-    end
-
-    def abbreviation?(token)
-      !defined_abbreviation?(token) && token !~ REGEXP_ONLY_LETTERS && token !~ REGEXP_ABBREVIATION
-    end
-
-    def defined_abbreviation?(token)
-      @abbreviations.include?(inverse_case(token))
-    end
-
-    def inverse_case(token)
-      @downcase ? token : Unicode.downcase(token)
-    end
-
-    def replace_last_token
-      last_token = @cleaned_tokens[-1]
-      return if defined_abbreviation?(last_token.chomp(DOT)) || last_token !~ REGEXP_ENDS_WITH_DOT
-      @cleaned_tokens[-1] = Regexp.last_match(1)
-      @cleaned_tokens << DOT
-    end
-
-  end
-
-end
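For anyone migrating code that instantiated this class directly (its responsibilities were presumably absorbed by the new regex.rb and the reworked post_processor.rb): a hypothetical pre-3.2.0 invocation, with made-up tokens and abbreviation set, traced against the source above.

require 'set'
require 'pragmatic_tokenizer' # pre-3.2.0 only; the class no longer exists

separator = PragmaticTokenizer::FullStopSeparator.new(
  tokens:        ["Mr.", "Smith", "arrived."],
  abbreviations: Set.new(["mr"]), # entries are looked up downcased
  downcase:      false
)
separator.separate
# => ["Mr.", "Smith", "arrived", "."]
# "Mr." is left intact because "mr" is in the abbreviation set, while
# replace_last_token detaches the sentence-final period from "arrived.".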