pragmatic_tokenizer 3.0.5 → 3.0.6

This diff shows the content of publicly available package versions as released to their public registries, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 3992076b1304fc76da055925e851e5d61b27dea6
-  data.tar.gz: ab52d479ad9f83018e18fa6c8966cd6213813646
+  metadata.gz: 746fe8bd11bb0bd75cd7553a7f52d37810a3962f
+  data.tar.gz: 2ec1b073ec014f15a7820297cfdaa46457b94130
 SHA512:
-  metadata.gz: 325bba401a3cc218aa984e88828775a1718d11b8f6170d950563cdf90ef5f3a5755feaaaa6760a37a8c29fa63002c36ea48b530fb601c91ea953197e93fc7159
-  data.tar.gz: af2d68f841b70444ce90d5ad00b4d0cb0d33ce1d72d254d4f4cecdc10e11bab5954e8757757663097e53e29c6f143039e5d00764412e78870c933e6e784157d5
+  metadata.gz: e0a368fe63c7fd4b6f2d0f5636abd922797ff0cc84cd41fdd728803f245d5380e746a3dba02daa585dbe26c7ac84f11f94ac18cdec928dbfe8560a1a45c833d9
+  data.tar.gz: 178b2cc47e431cbc6c11ddd4fecd55394dc5498cd98651c4a632f1c923b2fd2ca73ed71c65353f9d33662141b51b51b5e4d03db51dade1876d5de1c16781359f
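Both the SHA1 and SHA512 digests changed, as expected for a rebuilt gem. A minimal Ruby sketch for verifying them locally, assuming pragmatic_tokenizer-3.0.6.gem has already been fetched (e.g. with gem fetch pragmatic_tokenizer -v 3.0.6; a .gem file is a plain tar archive containing metadata.gz and data.tar.gz):

require 'digest'
require 'rubygems/package'

# Read the two checksummed entries straight out of the .gem tarball
# and print their digests for comparison against checksums.yaml above.
File.open('pragmatic_tokenizer-3.0.6.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io) do |tar|
    tar.each do |entry|
      next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
      data = entry.read
      puts "#{entry.full_name} SHA1:   #{Digest::SHA1.hexdigest(data)}"
      puts "#{entry.full_name} SHA512: #{Digest::SHA512.hexdigest(data)}"
    end
  end
end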
lib/pragmatic_tokenizer/full_stop_separator.rb CHANGED
@@ -17,7 +17,7 @@ module PragmaticTokenizer
     end
 
     def separate
-      create_cleaned_tokens
+      @cleaned_tokens = create_cleaned_tokens
       replace_last_token unless @cleaned_tokens.empty?
       @cleaned_tokens
     end
@@ -25,21 +25,15 @@ module PragmaticTokenizer
     private
 
     def create_cleaned_tokens
-      @cleaned_tokens = []
-      @tokens.each_with_index do |token, position|
-        if @tokens[position + 1] && token =~ REGEXP_ENDS_WITH_DOT
-          match = Regexp.last_match(1)
-          if abbreviation?(match)
-            @cleaned_tokens += [match, DOT]
-            next
-          end
-        end
-        @cleaned_tokens << token
-      end
+      @tokens[0..-2]
+          .flat_map { |token| abbreviation?(token) ? [token[0..-2], DOT] : token }
+          .push(@tokens.last)
     end
 
     def abbreviation?(token)
-      !defined_abbreviation?(token) && token !~ REGEXP_ONLY_LETTERS && token !~ REGEXP_ABBREVIATION
+      return false unless token.end_with?(DOT) && token.length > 1
+      shortened = token.chomp(DOT)
+      !defined_abbreviation?(shortened) && shortened !~ REGEXP_ONLY_LETTERS && shortened !~ REGEXP_ABBREVIATION
     end
 
     def defined_abbreviation?(token)
@@ -52,7 +46,9 @@ module PragmaticTokenizer
 
     def replace_last_token
       last_token = @cleaned_tokens[-1]
-      return if defined_abbreviation?(last_token.chomp(DOT)) || last_token !~ REGEXP_ENDS_WITH_DOT
+      return unless last_token.end_with?(DOT) && last_token.length > 1
+      shortened = last_token.chomp(DOT)
+      return if defined_abbreviation?(shortened) || last_token !~ REGEXP_ENDS_WITH_DOT
       @cleaned_tokens[-1] = Regexp.last_match(1)
       @cleaned_tokens << DOT
     end
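Taken together, these hunks rebuild the cleaned token list functionally (flat_map over all but the last token, then push the last) instead of mutating an accumulator, and both abbreviation? and replace_last_token now guard against one-character tokens. A minimal usage sketch; the token stream and abbreviation set are hypothetical, and the expected output follows the code paths above:

require 'set'
require 'pragmatic_tokenizer'

separator = PragmaticTokenizer::FullStopSeparator.new(
  tokens:        ["Mr.", "Smith", "arrived", "late."],
  abbreviations: Set.new(["mr"]),
  downcase:      false
)

# "Mr." survives intact because "mr" is a defined abbreviation, while the
# sentence-final period of "late." is split off into its own token.
separator.separate # => ["Mr.", "Smith", "arrived", "late", "."]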
lib/pragmatic_tokenizer/post_processor.rb CHANGED
@@ -1,26 +1,43 @@
 module PragmaticTokenizer
   class PostProcessor
 
-    REGEX_SYMBOL = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/
-    REGEXP_COMMAS = /^(,|‚)+/
-    REGEXP_SINGLE_QUOTES = /(.+)(’|'|‘|`)$/
-    REGEXP_SLASH = /^(?!(https?:|www\.))(.*)\/(.*)/
-    REGEXP_QUESTION_MARK = /^(?!(https?:|www\.))(.*)(\?)(.*)/
+    DOT = '.'.freeze
+    RANGE_DINGBATS = '[\u2701-\u27BE]'.freeze # e.g. ✁✎✳❄➾
+    RANGE_VARIATION_SELECTORS = '[\uFE00-\uFE0F]'.freeze # alter the previous character
+    RANGE_FULLWIDTH = '[\uFF01-\uFF1F]'.freeze # e.g. ！＂＃＇？
+
+    REGEXP_COMMAS = /^([,‚])+/
+    REGEXP_SINGLE_QUOTES = /(.+)([’'‘`])$/
+    REGEXP_SLASH = /^(?!(https?:|www\.))(.*)\//
+    REGEXP_QUESTION_MARK = /^(?!(https?:|www\.))(.*)(\?)/
     REGEXP_PLUS_SIGN = /(.+)\+(.+)/
-    REGEXP_COLON = /^(\:)(\S{2,})/
-    REGEXP_EMOJI = /(\u{2744}[\u{FE0E}|\u{FE0F}])/
+    REGEXP_COLON = /^(:)(\S{2,})/
+    REGEXP_DINGBATS = /(#{RANGE_DINGBATS}#{RANGE_VARIATION_SELECTORS}*)/
+    REGEXP_ENDING_PUNCT = /(?<=\S)([#{RANGE_FULLWIDTH}!?]+)$/
+    REGEXP_DOMAIN = /^((https?:\/\/|)?[a-z0-9]+([\-\.][a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?)$/ix
+    REGEXP_EMAIL = /\S+[@＠]\S+/
+    REGEXP_DOMAIN_START = /^(https?:|www\.|[[:alpha:]]\.)/
+    REGEXP_DOMAIN_END = /\.(com|net|org|edu|gov|mil|int|[[:alpha:]]{2})$/
+    REGEXP_DIGIT = /[[:digit:]]+/
+    REGEXP_PERIOD1 = /(.*\.)/
+    REGEXP_PERIOD2 = /(\.)/
 
     REGEX_UNIFIED1 = Regexp.union(REGEXP_SLASH,
                                   REGEXP_QUESTION_MARK,
                                   REGEXP_PLUS_SIGN,
                                   REGEXP_COLON,
-                                  REGEXP_EMOJI,
+                                  REGEXP_DINGBATS,
                                   PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX,
                                   PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX)
 
     REGEX_UNIFIED2 = Regexp.union(REGEXP_SINGLE_QUOTES,
                                   REGEXP_COMMAS)
-    REGEXP_UNKNOWN1 = /(?<=\S)([。．！!？?]+)$/
+
+    REGEX_DOMAIN_EMAIL = Regexp.union(REGEXP_DOMAIN,
+                                      REGEXP_EMAIL)
+
+    REGEX_DOMAIN = Regexp.union(REGEXP_DOMAIN_START,
+                                REGEXP_DOMAIN_END)
 
     attr_reader :text, :abbreviations, :downcase
 
@@ -31,19 +48,24 @@ module PragmaticTokenizer
     end
 
     def post_process
-      separate_ending_punctuation(post_process_punctuation)
+      procs.reduce(full_stop_separated_tokens) { |a, e| a.flat_map(&e) }
     end
 
     private
 
-    def post_process_punctuation
-      separated = separate_ending_punctuation(full_stop_separated_tokens)
-      procs = [unified1, split_unknown_period1, split_unknown_period2, split_emoji]
-      procs.reduce(separated) { |a, e| a.flat_map(&e) }
+    # note: we need to run #separate_ending_punctuation twice. maybe there's a better solution?
+    def procs
+      [
+        separate_ending_punctuation,
+        unified1,
+        split_unknown_period1,
+        split_unknown_period2,
+        separate_ending_punctuation
+      ]
     end
 
-    def separate_ending_punctuation(tokens)
-      tokens.flat_map { |token| token.split(REGEXP_UNKNOWN1) }
+    def separate_ending_punctuation
+      proc { |token| token.split(REGEXP_ENDING_PUNCT) }
     end
 
     def unified1
@@ -51,64 +73,48 @@ module PragmaticTokenizer
     end
 
     def full_stop_separated_tokens
-      FullStopSeparator.new(tokens: split_and_convert_commas_and_quotes, abbreviations: abbreviations, downcase: downcase).separate
+      FullStopSeparator.new(tokens: split_convert_commas_quotes, abbreviations: abbreviations, downcase: downcase).separate
     end
 
-    def split_and_convert_commas_and_quotes
+    def split_convert_commas_quotes
       text
           .split
           .flat_map { |token| token.split(REGEX_UNIFIED2) }
           .flat_map { |token| convert_sym_to_punct(token) }
     end
 
-    def split_emoji
-      proc { |token| (token =~ /(\A|\S)\u{2744}[^\u{FE0E}|\u{FE0F}]/) ? token.split(/(\u{2744})/) : token }
-    end
-
     def split_unknown_period1
-      proc { |token| unknown_period1?(token) ? token.split(/(.*\.)/) : token }
+      proc { |token| unknown_period1?(token) ? token.split(REGEXP_PERIOD1) : token }
     end
 
     def split_unknown_period2
-      proc { |token| unknown_period2?(token) ? token.split(/(\.)/) : token }
+      proc { |token| unknown_period2?(token) ? token.split(REGEXP_PERIOD2) : token }
    end
 
     def unknown_period1?(token)
-      token.include?(".") &&
-        token !~ /(http|https|www)(\.|:)/ &&
+      token.include?(DOT) &&
         token.length > 1 &&
-        token !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
-        token !~ /\S+(@|＠)\S+/ &&
+        token !~ REGEX_DOMAIN_EMAIL &&
         abbreviations.include?(extract_abbreviation(token))
     end
 
     def unknown_period2?(token)
-      token.include?(".") &&
-        token !~ /(http|https|www)(\.|:)/ &&
-        token !~ /\.(com|net|org|edu|gov|mil|int)/ &&
-        token !~ /\.[a-zA-Z]{2}(\s|\z)/ &&
-        token.length > 2 &&
-        token !~ /\A[a-zA-Z]{1}\./ &&
-        token.count(".") == 1 &&
-        token !~ /\d+/ &&
-        !abbreviations.include?(extract_abbreviation(token)) &&
-        token !~ /\S+(@|＠)\S+/
+      token.include?(DOT) &&
+        token !~ REGEX_DOMAIN &&
+        token !~ REGEXP_DIGIT &&
+        token.count(DOT) == 1 &&
+        !abbreviations.include?(extract_abbreviation(token))
     end
 
     def extract_abbreviation(token)
-      before_first_dot = token[0, token.index('.'.freeze)]
+      before_first_dot = token[0, token.index(DOT)]
       downcase ? before_first_dot : Unicode.downcase(before_first_dot)
     end
 
     def convert_sym_to_punct(token)
-      symbol_matches = REGEX_SYMBOL.match(token)
-      if symbol_matches.nil?
-        token
-      else
-        pattern = symbol_matches[0]
-        replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(pattern)
-        token.gsub!(pattern, replacement)
-      end
+      PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP
+          .each { |pattern, replacement| break if token.sub!(replacement, pattern) }
+      token
     end
 
   end
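The rewritten #post_process expresses the whole pass as an array of procs reduced over the token list with flat_map, so any stage can expand one token into several or drop it entirely. A standalone sketch of that pattern, with illustrative stages that are not the gem's own:

# Each stage is a proc returning either a token or an array of tokens;
# flat_map flattens every stage's per-token output back into one flat list.
split_trailing_bang = proc { |t| t.end_with?("!") ? [t.chomp("!"), "!"] : t }
drop_empties        = proc { |t| t.empty? ? [] : t }

stages = [split_trailing_bang, drop_empties]
stages.reduce(["hello!", "", "world"]) { |tokens, stage| tokens.flat_map(&stage) }
# => ["hello", "!", "world"]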
lib/pragmatic_tokenizer/tokenizer.rb CHANGED
@@ -64,7 +64,7 @@ module PragmaticTokenizer
     REGEXP_NO_NUMBERS = /\A\D+\z/
     REGEXP_NUMBER = /\D*\d+\d*/
     REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
-    REGEXP_CHUNK_STRING = /.{,10000}(?=\s|\z)/m
+    REGEXP_CHUNK_STRING = /\S.{1,10000}(?!\S)/m
 
     # @param [Hash] opts optional arguments
 
@@ -150,7 +150,7 @@ module PragmaticTokenizer
 
     def tokenize(text)
       return [] unless text
-      raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
+      raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
       CGI.unescapeHTML(text)
           .scan(REGEXP_CHUNK_STRING)
           .flat_map { |segment| post_process(pre_process(segment)) }
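Two behavioral notes on this file. The old chunk pattern /.{,10000}(?=\s|\z)/m admits zero-length matches, while the new one requires a chunk to start on a non-space character and end where no non-space follows, so #scan yields only non-empty, whitespace-bounded segments. A scaled-down illustration with a ceiling of 5 instead of 10000:

# Same shape as the new REGEXP_CHUNK_STRING, shrunk so the
# whitespace-bounded chunking is visible on a short string.
chunker = /\S.{1,5}(?!\S)/m

"one two three".scan(chunker)
# => ["one", "two", "three"]   (chunks break at whitespace, never mid-word)

The relaxed guard in #tokenize (text.class <= String) additionally accepts subclasses of String, such as ActiveSupport::SafeBuffer.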
lib/pragmatic_tokenizer/version.rb CHANGED
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "3.0.5".freeze
+  VERSION = "3.0.6".freeze
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0.5
+  version: 3.0.6
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-09-19 00:00:00.000000000 Z
+date: 2018-03-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -169,7 +169,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.6.12
+rubygems_version: 2.6.14
 signing_key:
 specification_version: 4
 summary: A multilingual tokenizer