pragmatic_tokenizer 3.0.5 → 3.0.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 3992076b1304fc76da055925e851e5d61b27dea6
-  data.tar.gz: ab52d479ad9f83018e18fa6c8966cd6213813646
+  metadata.gz: 746fe8bd11bb0bd75cd7553a7f52d37810a3962f
+  data.tar.gz: 2ec1b073ec014f15a7820297cfdaa46457b94130
 SHA512:
-  metadata.gz: 325bba401a3cc218aa984e88828775a1718d11b8f6170d950563cdf90ef5f3a5755feaaaa6760a37a8c29fa63002c36ea48b530fb601c91ea953197e93fc7159
-  data.tar.gz: af2d68f841b70444ce90d5ad00b4d0cb0d33ce1d72d254d4f4cecdc10e11bab5954e8757757663097e53e29c6f143039e5d00764412e78870c933e6e784157d5
+  metadata.gz: e0a368fe63c7fd4b6f2d0f5636abd922797ff0cc84cd41fdd728803f245d5380e746a3dba02daa585dbe26c7ac84f11f94ac18cdec928dbfe8560a1a45c833d9
+  data.tar.gz: 178b2cc47e431cbc6c11ddd4fecd55394dc5498cd98651c4a632f1c923b2fd2ca73ed71c65353f9d33662141b51b51b5e4d03db51dade1876d5de1c16781359f
lib/pragmatic_tokenizer/full_stop_separator.rb CHANGED
@@ -17,7 +17,7 @@ module PragmaticTokenizer
     end
 
     def separate
-      create_cleaned_tokens
+      @cleaned_tokens = create_cleaned_tokens
       replace_last_token unless @cleaned_tokens.empty?
       @cleaned_tokens
     end
@@ -25,21 +25,15 @@ module PragmaticTokenizer
     private
 
     def create_cleaned_tokens
-      @cleaned_tokens = []
-      @tokens.each_with_index do |token, position|
-        if @tokens[position + 1] && token =~ REGEXP_ENDS_WITH_DOT
-          match = Regexp.last_match(1)
-          if abbreviation?(match)
-            @cleaned_tokens += [match, DOT]
-            next
-          end
-        end
-        @cleaned_tokens << token
-      end
+      @tokens[0..-2]
+          .flat_map { |token| abbreviation?(token) ? [token[0..-2], DOT] : token }
+          .push(@tokens.last)
    end
 
     def abbreviation?(token)
-      !defined_abbreviation?(token) && token !~ REGEXP_ONLY_LETTERS && token !~ REGEXP_ABBREVIATION
+      return false unless token.end_with?(DOT) && token.length > 1
+      shortened = token.chomp(DOT)
+      !defined_abbreviation?(shortened) && shortened !~ REGEXP_ONLY_LETTERS && shortened !~ REGEXP_ABBREVIATION
     end
 
     def defined_abbreviation?(token)
@@ -52,7 +46,9 @@ module PragmaticTokenizer
 
     def replace_last_token
       last_token = @cleaned_tokens[-1]
-      return if defined_abbreviation?(last_token.chomp(DOT)) || last_token !~ REGEXP_ENDS_WITH_DOT
+      return unless last_token.end_with?(DOT) && last_token.length > 1
+      shortened = last_token.chomp(DOT)
+      return if defined_abbreviation?(shortened) || last_token !~ REGEXP_ENDS_WITH_DOT
       @cleaned_tokens[-1] = Regexp.last_match(1)
       @cleaned_tokens << DOT
     end
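
The FullStopSeparator changes above replace the index-based accumulator loop with a flat_map pipeline and move the trailing-dot guard into #abbreviation? itself. A minimal sketch of the resulting behavior, with a plain abbreviation list standing in for the gem's defined_abbreviation?/REGEXP_* checks (separate_full_stops and its arguments are illustrative, not the gem's API):

# Minimal sketch of the refactored separation step; the gem's real
# #abbreviation? also consults REGEXP_ONLY_LETTERS / REGEXP_ABBREVIATION,
# omitted here for brevity.
DOT = '.'.freeze

def separate_full_stops(tokens, abbreviations)
  tokens.flat_map do |token|
    # keep the dot attached to known abbreviations ("etc."), split it otherwise
    next token unless token.end_with?(DOT) && token.length > 1
    shortened = token.chomp(DOT)
    abbreviations.include?(shortened) ? token : [shortened, DOT]
  end
end

p separate_full_stops(%w[Hello etc. world.], ['etc'])
# => ["Hello", "etc.", "world", "."]
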
lib/pragmatic_tokenizer/post_processor.rb CHANGED
@@ -1,26 +1,43 @@
 module PragmaticTokenizer
   class PostProcessor
 
-    REGEX_SYMBOL = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/
-    REGEXP_COMMAS = /^(,|‚)+/
-    REGEXP_SINGLE_QUOTES = /(.+)(’|'|‘|`)$/
-    REGEXP_SLASH = /^(?!(https?:|www\.))(.*)\/(.*)/
-    REGEXP_QUESTION_MARK = /^(?!(https?:|www\.))(.*)(\?)(.*)/
+    DOT = '.'.freeze
+    RANGE_DINGBATS = '[\u2701-\u27BE]'.freeze # e.g. ✁✎✳❄➾
+    RANGE_VARIATION_SELECTORS = '[\uFE00-\uFE0F]'.freeze # alter the previous character
+    RANGE_FULLWIDTH = '[\uFF01-\uFF1F]'.freeze # e.g. ！＂＃＇？
+
+    REGEXP_COMMAS = /^([,‚])+/
+    REGEXP_SINGLE_QUOTES = /(.+)([’'‘`])$/
+    REGEXP_SLASH = /^(?!(https?:|www\.))(.*)\//
+    REGEXP_QUESTION_MARK = /^(?!(https?:|www\.))(.*)(\?)/
     REGEXP_PLUS_SIGN = /(.+)\+(.+)/
-    REGEXP_COLON = /^(\:)(\S{2,})/
-    REGEXP_EMOJI = /(\u{2744}[\u{FE0E}|\u{FE0F}])/
+    REGEXP_COLON = /^(:)(\S{2,})/
+    REGEXP_DINGBATS = /(#{RANGE_DINGBATS}#{RANGE_VARIATION_SELECTORS}*)/
+    REGEXP_ENDING_PUNCT = /(?<=\S)([#{RANGE_FULLWIDTH}!?]+)$/
+    REGEXP_DOMAIN = /^((https?:\/\/|)?[a-z0-9]+([\-\.][a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?)$/ix
+    REGEXP_EMAIL = /\S+[@＠]\S+/
+    REGEXP_DOMAIN_START = /^(https?:|www\.|[[:alpha:]]\.)/
+    REGEXP_DOMAIN_END = /\.(com|net|org|edu|gov|mil|int|[[:alpha:]]{2})$/
+    REGEXP_DIGIT = /[[:digit:]]+/
+    REGEXP_PERIOD1 = /(.*\.)/
+    REGEXP_PERIOD2 = /(\.)/
 
     REGEX_UNIFIED1 = Regexp.union(REGEXP_SLASH,
                                   REGEXP_QUESTION_MARK,
                                   REGEXP_PLUS_SIGN,
                                   REGEXP_COLON,
-                                  REGEXP_EMOJI,
+                                  REGEXP_DINGBATS,
                                   PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX,
                                   PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX)
 
     REGEX_UNIFIED2 = Regexp.union(REGEXP_SINGLE_QUOTES,
                                   REGEXP_COMMAS)
-    REGEXP_UNKNOWN1 = /(?<=\S)([。.!！?？]+)$/
+
+    REGEX_DOMAIN_EMAIL = Regexp.union(REGEXP_DOMAIN,
+                                      REGEXP_EMAIL)
+
+    REGEX_DOMAIN = Regexp.union(REGEXP_DOMAIN_START,
+                                REGEXP_DOMAIN_END)
 
     attr_reader :text, :abbreviations, :downcase
 
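
The new RANGE_* constants swap the old single-snowflake REGEXP_EMOJI for the whole dingbats block (U+2701–U+27BE) followed by any variation selectors, which only alter how the preceding character renders. A rough illustration of why the selector has to be captured together with the dingbat, reusing the two constants from the hunk above:

# ✳️ is U+2733 followed by variation selector U+FE0F; capturing them as one
# group keeps String#split from tearing the pair apart.
RANGE_DINGBATS            = '[\u2701-\u27BE]'.freeze
RANGE_VARIATION_SELECTORS = '[\uFE00-\uFE0F]'.freeze
REGEXP_DINGBATS = /(#{RANGE_DINGBATS}#{RANGE_VARIATION_SELECTORS}*)/

p "hi✳️there".split(REGEXP_DINGBATS) # => ["hi", "✳️", "there"]
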
@@ -31,19 +48,24 @@ module PragmaticTokenizer
     end
 
     def post_process
-      separate_ending_punctuation(post_process_punctuation)
+      procs.reduce(full_stop_separated_tokens) { |a, e| a.flat_map(&e) }
     end
 
     private
 
-    def post_process_punctuation
-      separated = separate_ending_punctuation(full_stop_separated_tokens)
-      procs = [unified1, split_unknown_period1, split_unknown_period2, split_emoji]
-      procs.reduce(separated) { |a, e| a.flat_map(&e) }
+    # note: we need to run #separate_ending_punctuation twice. maybe there's a better solution?
+    def procs
+      [
+        separate_ending_punctuation,
+        unified1,
+        split_unknown_period1,
+        split_unknown_period2,
+        separate_ending_punctuation
+      ]
     end
 
-    def separate_ending_punctuation(tokens)
-      tokens.flat_map { |token| token.split(REGEXP_UNKNOWN1) }
+    def separate_ending_punctuation
+      proc { |token| token.split(REGEXP_ENDING_PUNCT) }
     end
 
     def unified1
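
The rewritten #post_process threads the token list through #procs with reduce and flat_map: each proc takes one token and returns either a token or an array of tokens, and flat_map keeps the list flat after every step. The same pattern in isolation, with two made-up token procs:

# Each proc maps token -> token or [token, ...]; reduce applies them in order.
pipeline = [
  proc { |t| t.split(/(!+)$/) },                   # peel trailing "!" into its own token
  proc { |t| t.include?('/') ? t.split('/') : t }  # split slash-joined words
]

p pipeline.reduce(%w[hello!! either/or]) { |a, e| a.flat_map(&e) }
# => ["hello", "!!", "either", "or"]
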
@@ -51,64 +73,48 @@ module PragmaticTokenizer
     end
 
     def full_stop_separated_tokens
-      FullStopSeparator.new(tokens: split_and_convert_commas_and_quotes, abbreviations: abbreviations, downcase: downcase).separate
+      FullStopSeparator.new(tokens: split_convert_commas_quotes, abbreviations: abbreviations, downcase: downcase).separate
     end
 
-    def split_and_convert_commas_and_quotes
+    def split_convert_commas_quotes
       text
           .split
           .flat_map { |token| token.split(REGEX_UNIFIED2) }
          .flat_map { |token| convert_sym_to_punct(token) }
     end
 
-    def split_emoji
-      proc { |token| (token =~ /(\A|\S)\u{2744}[^\u{FE0E}|\u{FE0F}]/) ? token.split(/(\u{2744})/) : token }
-    end
-
     def split_unknown_period1
-      proc { |token| unknown_period1?(token) ? token.split(/(.*\.)/) : token }
+      proc { |token| unknown_period1?(token) ? token.split(REGEXP_PERIOD1) : token }
     end
 
     def split_unknown_period2
-      proc { |token| unknown_period2?(token) ? token.split(/(\.)/) : token }
+      proc { |token| unknown_period2?(token) ? token.split(REGEXP_PERIOD2) : token }
     end
 
     def unknown_period1?(token)
-      token.include?(".") &&
-          token !~ /(http|https|www)(\.|:)/ &&
+      token.include?(DOT) &&
          token.length > 1 &&
-          token !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
-          token !~ /\S+(@|＠)\S+/ &&
+          token !~ REGEX_DOMAIN_EMAIL &&
          abbreviations.include?(extract_abbreviation(token))
     end
 
     def unknown_period2?(token)
-      token.include?(".") &&
-          token !~ /(http|https|www)(\.|:)/ &&
-          token !~ /\.(com|net|org|edu|gov|mil|int)/ &&
-          token !~ /\.[a-zA-Z]{2}(\s|\z)/ &&
-          token.length > 2 &&
-          token !~ /\A[a-zA-Z]{1}\./ &&
-          token.count(".") == 1 &&
-          token !~ /\d+/ &&
-          !abbreviations.include?(extract_abbreviation(token)) &&
-          token !~ /\S+(@|＠)\S+/
+      token.include?(DOT) &&
+          token !~ REGEX_DOMAIN &&
+          token !~ REGEXP_DIGIT &&
+          token.count(DOT) == 1 &&
+          !abbreviations.include?(extract_abbreviation(token))
     end
 
     def extract_abbreviation(token)
-      before_first_dot = token[0, token.index('.'.freeze)]
+      before_first_dot = token[0, token.index(DOT)]
       downcase ? before_first_dot : Unicode.downcase(before_first_dot)
     end
 
     def convert_sym_to_punct(token)
-      symbol_matches = REGEX_SYMBOL.match(token)
-      if symbol_matches.nil?
-        token
-      else
-        pattern = symbol_matches[0]
-        replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(pattern)
-        token.gsub!(pattern, replacement)
-      end
+      PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP
+          .each { |pattern, replacement| break if token.sub!(replacement, pattern) }
+      token
     end
 
   end
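
The new #convert_sym_to_punct walks PUNCTUATION_MAP and restores the first placeholder symbol it finds, breaking as soon as one sub! succeeds, instead of matching the old hand-written REGEX_SYMBOL class and reverse-looking-up the map. A standalone approximation (the two-entry map here is a stand-in for the gem's real PUNCTUATION_MAP):

# Stand-in for PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP, which
# maps punctuation to private placeholder symbols used during pre-processing.
PUNCTUATION_MAP = { '.' => '♳', ',' => '♴' }.freeze

def convert_sym_to_punct(token)
  # String#sub! returns nil when nothing was replaced, so break fires only on a hit
  PUNCTUATION_MAP.each { |pattern, replacement| break if token.sub!(replacement, pattern) }
  token
end

p convert_sym_to_punct('hello♳'.dup) # => "hello."
p convert_sym_to_punct('plain'.dup)  # => "plain" (no placeholder, unchanged)
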
lib/pragmatic_tokenizer/tokenizer.rb CHANGED
@@ -64,7 +64,7 @@ module PragmaticTokenizer
     REGEXP_NO_NUMBERS = /\A\D+\z/
     REGEXP_NUMBER = /\D*\d+\d*/
     REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
-    REGEXP_CHUNK_STRING = /.{,10000}(?=\s|\z)/m
+    REGEXP_CHUNK_STRING = /\S.{1,10000}(?!\S)/m
 
     # @param [Hash] opts optional arguments
 
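The REGEXP_CHUNK_STRING fix tightens how #tokenize slices long input into chunks of at most ~10,000 characters: the old pattern admitted empty matches and chunks led by whitespace, while the new one anchors every chunk to start on a non-space character and end at a space-or-end boundary. A quick comparison on a short string:

OLD_CHUNK = /.{,10000}(?=\s|\z)/m
NEW_CHUNK = /\S.{1,10000}(?!\S)/m

p "a bb ccc".scan(OLD_CHUNK) # => ["a bb ccc", ""]  (stray empty chunk at the end)
p "a bb ccc".scan(NEW_CHUNK) # => ["a bb ccc"]
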
@@ -150,7 +150,7 @@ module PragmaticTokenizer
 
     def tokenize(text)
       return [] unless text
-      raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
+      raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
       CGI.unescapeHTML(text)
          .scan(REGEXP_CHUNK_STRING)
          .flat_map { |segment| post_process(pre_process(segment)) }
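
Relaxing text.class == String to text.class <= String lets #tokenize accept instances of String subclasses, not just String itself. A small usage sketch — AnnotatedString is hypothetical, and the exact output assumes the tokenizer's defaults (downcase: true, punctuation kept):

require 'pragmatic_tokenizer'

# Hypothetical String subclass, e.g. from a library that tags its strings
class AnnotatedString < String; end

text = AnnotatedString.new('Hello world.')
text.class <= String # => true, so the type guard no longer raises

p PragmaticTokenizer::Tokenizer.new.tokenize(text)
# expected with default options: ["hello", "world", "."]
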
lib/pragmatic_tokenizer/version.rb CHANGED
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "3.0.5".freeze
+  VERSION = "3.0.6".freeze
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0.5
+  version: 3.0.6
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-09-19 00:00:00.000000000 Z
+date: 2018-03-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -169,7 +169,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.6.12
+rubygems_version: 2.6.14
 signing_key:
 specification_version: 4
 summary: A multilingual tokenizer