pragmatic_tokenizer 3.0.5 → 3.0.6
This diff shows the changes between package versions as they appear in their respective public registries. It covers only publicly available releases and is provided for informational purposes.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 746fe8bd11bb0bd75cd7553a7f52d37810a3962f
+  data.tar.gz: 2ec1b073ec014f15a7820297cfdaa46457b94130
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e0a368fe63c7fd4b6f2d0f5636abd922797ff0cc84cd41fdd728803f245d5380e746a3dba02daa585dbe26c7ac84f11f94ac18cdec928dbfe8560a1a45c833d9
+  data.tar.gz: 178b2cc47e431cbc6c11ddd4fecd55394dc5498cd98651c4a632f1c923b2fd2ca73ed71c65353f9d33662141b51b51b5e4d03db51dade1876d5de1c16781359f
lib/pragmatic_tokenizer/full_stop_separator.rb
CHANGED
@@ -17,7 +17,7 @@ module PragmaticTokenizer
     end
 
     def separate
-      create_cleaned_tokens
+      @cleaned_tokens = create_cleaned_tokens
       replace_last_token unless @cleaned_tokens.empty?
       @cleaned_tokens
     end
@@ -25,21 +25,15 @@ module PragmaticTokenizer
     private
 
     def create_cleaned_tokens
-      @
-
-
-          match = Regexp.last_match(1)
-          if abbreviation?(match)
-            @cleaned_tokens += [match, DOT]
-            next
-          end
-        end
-        @cleaned_tokens << token
-      end
+      @tokens[0..-2]
+          .flat_map { |token| abbreviation?(token) ? [token[0..-2], DOT] : token }
+          .push(@tokens.last)
     end
 
     def abbreviation?(token)
-
+      return false unless token.end_with?(DOT) && token.length > 1
+      shortened = token.chomp(DOT)
+      !defined_abbreviation?(shortened) && shortened !~ REGEXP_ONLY_LETTERS && shortened !~ REGEXP_ABBREVIATION
     end
 
     def defined_abbreviation?(token)
@@ -52,7 +46,9 @@ module PragmaticTokenizer
 
     def replace_last_token
       last_token = @cleaned_tokens[-1]
-      return
+      return unless last_token.end_with?(DOT) && last_token.length > 1
+      shortened = last_token.chomp(DOT)
+      return if defined_abbreviation?(shortened) || last_token !~ REGEXP_ENDS_WITH_DOT
       @cleaned_tokens[-1] = Regexp.last_match(1)
       @cleaned_tokens << DOT
     end
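The rewritten create_cleaned_tokens replaces the old each/next loop with a single flat_map pipeline: every token except the last either passes through unchanged or is split into the bare token plus a trailing DOT, while the final token is appended untouched for replace_last_token to deal with afterwards. A minimal standalone sketch of that pattern (split_dot? and the two-entry abbreviation list are illustrative stand-ins, not the gem's real abbreviation? logic):

DOT = '.'.freeze
KNOWN_ABBREVIATIONS = %w[dr etc].freeze # hypothetical stand-in for the gem's lookup

# Peel the final dot off a token unless it looks like a known abbreviation.
def split_dot?(token)
  token.end_with?(DOT) && token.length > 1 &&
    !KNOWN_ABBREVIATIONS.include?(token.chomp(DOT).downcase)
end

tokens  = %w[Meet Dr. Smith today. Bring snacks.]
cleaned = tokens[0..-2]
          .flat_map { |token| split_dot?(token) ? [token[0..-2], DOT] : token }
          .push(tokens.last)
p cleaned # => ["Meet", "Dr.", "Smith", "today", ".", "Bring", "snacks."]

Keeping the last token out of the loop is deliberate: its trailing dot may be a genuine full stop, which replace_last_token decides separately using defined_abbreviation? and the REGEXP_ENDS_WITH_DOT check shown above.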
lib/pragmatic_tokenizer/post_processor.rb
CHANGED
@@ -1,26 +1,43 @@
 module PragmaticTokenizer
   class PostProcessor
 
-
-
-
-
-
+    DOT                       = '.'.freeze
+    RANGE_DINGBATS            = '[\u2701-\u27BE]'.freeze # e.g. ✁✎✳❄➾
+    RANGE_VARIATION_SELECTORS = '[\uFE00-\uFE0F]'.freeze # alter the previous character
+    RANGE_FULLWIDTH           = '[\uFF01-\uFF1F]'.freeze # e.g. ！＂＃＇？
+
+    REGEXP_COMMAS             = /^([,‚])+/
+    REGEXP_SINGLE_QUOTES      = /(.+)([’'‘`])$/
+    REGEXP_SLASH              = /^(?!(https?:|www\.))(.*)\//
+    REGEXP_QUESTION_MARK      = /^(?!(https?:|www\.))(.*)(\?)/
     REGEXP_PLUS_SIGN          = /(.+)\+(.+)/
-    REGEXP_COLON              = /^(
-
+    REGEXP_COLON              = /^(:)(\S{2,})/
+    REGEXP_DINGBATS           = /(#{RANGE_DINGBATS}#{RANGE_VARIATION_SELECTORS}*)/
+    REGEXP_ENDING_PUNCT       = /(?<=\S)([#{RANGE_FULLWIDTH}!?]+)$/
+    REGEXP_DOMAIN             = /^((https?:\/\/|)?[a-z0-9]+([\-\.][a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?)$/ix
+    REGEXP_EMAIL              = /\S+[@＠]\S+/
+    REGEXP_DOMAIN_START       = /^(https?:|www\.|[[:alpha:]]\.)/
+    REGEXP_DOMAIN_END         = /\.(com|net|org|edu|gov|mil|int|[[:alpha:]]{2})$/
+    REGEXP_DIGIT              = /[[:digit:]]+/
+    REGEXP_PERIOD1            = /(.*\.)/
+    REGEXP_PERIOD2            = /(\.)/
 
     REGEX_UNIFIED1 = Regexp.union(REGEXP_SLASH,
                                   REGEXP_QUESTION_MARK,
                                   REGEXP_PLUS_SIGN,
                                   REGEXP_COLON,
-
+                                  REGEXP_DINGBATS,
                                   PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX,
                                   PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX)
 
     REGEX_UNIFIED2 = Regexp.union(REGEXP_SINGLE_QUOTES,
                                   REGEXP_COMMAS)
-
+
+    REGEX_DOMAIN_EMAIL = Regexp.union(REGEXP_DOMAIN,
+                                      REGEXP_EMAIL)
+
+    REGEX_DOMAIN = Regexp.union(REGEXP_DOMAIN_START,
+                                REGEXP_DOMAIN_END)
 
     attr_reader :text, :abbreviations, :downcase
 
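The new constants hoist every inline pattern to the top of the class, and Regexp.union folds related ones into a single pattern so each token needs only one match attempt per group. A small sketch of the combined domain-or-email check (the two patterns are copied from the diff; the character class in REGEXP_EMAIL pairs the ASCII @ with its fullwidth counterpart ＠, and the sample strings below are arbitrary):

REGEXP_DOMAIN = /^((https?:\/\/|)?[a-z0-9]+([\-\.][a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?)$/ix
REGEXP_EMAIL  = /\S+[@＠]\S+/
REGEX_DOMAIN_EMAIL = Regexp.union(REGEXP_DOMAIN, REGEXP_EMAIL)

p 'example.com' =~ REGEX_DOMAIN_EMAIL # => 0 (matches as a domain)
p 'a@b.org'     =~ REGEX_DOMAIN_EMAIL # => 0 (matches as an email)
p 'etc.'        =~ REGEX_DOMAIN_EMAIL # => nil (neither, so splitting may proceed)

In unknown_period1? further down, token !~ REGEX_DOMAIN_EMAIL is what lets URL- and address-like tokens keep their dots.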
@@ -31,19 +48,24 @@ module PragmaticTokenizer
     end
 
     def post_process
-
+      procs.reduce(full_stop_separated_tokens) { |a, e| a.flat_map(&e) }
     end
 
     private
 
-
-
-
-
+    # note: we need to run #separate_ending_punctuation twice. maybe there's a better solution?
+    def procs
+      [
+        separate_ending_punctuation,
+        unified1,
+        split_unknown_period1,
+        split_unknown_period2,
+        separate_ending_punctuation
+      ]
     end
 
-    def separate_ending_punctuation
-
+    def separate_ending_punctuation
+      proc { |token| token.split(REGEXP_ENDING_PUNCT) }
     end
 
     def unified1
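The new #post_process threads the token list through each proc in turn; because flat_map flattens exactly one level, a step may return either a single token or an array of replacement tokens. A self-contained sketch of the reduce/flat_map composition (the two sample procs are simplified stand-ins, not the gem's actual steps):

# Each step maps one token to a token or to an array of tokens.
procs = [
  proc { |token| token.split(/(!+)$/) },                          # peel trailing "!" runs
  proc { |token| token.include?('+') ? token.split('+') : token } # break "a+b" pairs
]

tokens = ['really+fun', 'wow!!']
result = procs.reduce(tokens) { |a, e| a.flat_map(&e) }
p result # => ["really", "fun", "wow", "!!"]

separate_ending_punctuation is registered both first and last, presumably because intermediate splits can expose fresh trailing punctuation; the comment in the diff flags the double pass as a known wart.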
@@ -51,64 +73,48 @@ module PragmaticTokenizer
     end
 
     def full_stop_separated_tokens
-      FullStopSeparator.new(tokens:
+      FullStopSeparator.new(tokens: split_convert_commas_quotes, abbreviations: abbreviations, downcase: downcase).separate
     end
 
-    def
+    def split_convert_commas_quotes
       text
           .split
           .flat_map { |token| token.split(REGEX_UNIFIED2) }
           .flat_map { |token| convert_sym_to_punct(token) }
     end
 
-    def split_emoji
-      proc { |token| (token =~ /(\A|\S)\u{2744}[^\u{FE0E}|\u{FE0F}]/) ? token.split(/(\u{2744})/) : token }
-    end
-
     def split_unknown_period1
-      proc { |token| unknown_period1?(token) ? token.split(
+      proc { |token| unknown_period1?(token) ? token.split(REGEXP_PERIOD1) : token }
     end
 
     def split_unknown_period2
-      proc { |token| unknown_period2?(token) ? token.split(
+      proc { |token| unknown_period2?(token) ? token.split(REGEXP_PERIOD2) : token }
     end
 
     def unknown_period1?(token)
-      token.include?(
-        token !~ /(http|https|www)(\.|:)/ &&
+      token.include?(DOT) &&
         token.length > 1 &&
-        token !~
-        token !~ /\S+(@|＠)\S+/ &&
+        token !~ REGEX_DOMAIN_EMAIL &&
         abbreviations.include?(extract_abbreviation(token))
     end
 
     def unknown_period2?(token)
-      token.include?(
-      token !~
-      token !~
-      token
-      token
-        token !~ /\A[a-zA-Z]{1}\./ &&
-        token.count(".") == 1 &&
-        token !~ /\d+/ &&
-        !abbreviations.include?(extract_abbreviation(token)) &&
-        token !~ /\S+(@|＠)\S+/
+      token.include?(DOT) &&
+        token !~ REGEX_DOMAIN &&
+        token !~ REGEXP_DIGIT &&
+        token.count(DOT) == 1 &&
+        !abbreviations.include?(extract_abbreviation(token))
     end
 
     def extract_abbreviation(token)
-      before_first_dot = token[0, token.index(
+      before_first_dot = token[0, token.index(DOT)]
       downcase ? before_first_dot : Unicode.downcase(before_first_dot)
     end
 
     def convert_sym_to_punct(token)
-
-
-
-      else
-        pattern     = symbol_matches[0]
-        replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(pattern)
-        token.gsub!(pattern, replacement)
-      end
+      PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP
+          .each { |pattern, replacement| break if token.sub!(replacement, pattern) }
+      token
     end
 
   end
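The rewritten convert_sym_to_punct leans on String#sub! returning nil when no substitution happened: the each loop over the punctuation map stops at the first placeholder that is actually restored. A sketch with an invented two-entry map (the real one is PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP):

# Hypothetical map in the same shape: punctuation mark => placeholder symbol.
PUNCTUATION_MAP = { '.' => '&dot;', ',' => '&comma;' }.freeze

def convert_sym_to_punct(token)
  # sub! returns nil when nothing was replaced, so break only fires once
  # a placeholder has been swapped back to its punctuation mark.
  PUNCTUATION_MAP.each { |pattern, replacement| break if token.sub!(replacement, pattern) }
  token
end

p convert_sym_to_punct('end&dot;') # => "end."
p convert_sym_to_punct('plain')    # => "plain"

Compared with the old gsub!-based branch, this restores at most one placeholder per token and never scans the map further than it must.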
lib/pragmatic_tokenizer/tokenizer.rb
CHANGED
@@ -64,7 +64,7 @@ module PragmaticTokenizer
     REGEXP_NO_NUMBERS       = /\A\D+\z/
     REGEXP_NUMBER           = /\D*\d+\d*/
    REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
-    REGEXP_CHUNK_STRING     =
+    REGEXP_CHUNK_STRING     = /\S.{1,10000}(?!\S)/m
 
     # @param [Hash] opts optional arguments
 
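The filled-in REGEXP_CHUNK_STRING feeds the .scan call in #tokenize, slicing arbitrarily long input into chunks of roughly 10,000 characters before pre- and post-processing: each chunk starts on a non-space character, and the (?!\S) lookahead forbids ending mid-word. The same pattern with a toy limit of 10, so the behavior is visible on a short string:

# /m lets "." cross newlines, so a chunk can span multiple lines.
chunk = /\S.{1,10}(?!\S)/m

p "the quick brown fox jumps".scan(chunk)
# => ["the quick", "brown fox", "jumps"]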
@@ -150,7 +150,7 @@ module PragmaticTokenizer
 
     def tokenize(text)
       return [] unless text
-      raise "In
+      raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
       CGI.unescapeHTML(text)
          .scan(REGEXP_CHUNK_STRING)
          .flat_map { |segment| post_process(pre_process(segment)) }
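The sharper raise message comes with a subclass-friendly type check: Module#<= is true for a class compared against itself or an ancestor and nil for unrelated classes, so String subclasses now pass while everything else still raises. For instance:

# Module#<= : true for self or a subclass, nil when unrelated.
class Emphatic < String; end

p Emphatic <= String # => true
p String   <= String # => true
p Symbol   <= String # => nil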
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0.5
+  version: 3.0.6
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2018-03-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -169,7 +169,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.6.
+rubygems_version: 2.6.14
 signing_key:
 specification_version: 4
 summary: A multilingual tokenizer