pragmatic_tokenizer 3.0.5 → 3.0.6
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 746fe8bd11bb0bd75cd7553a7f52d37810a3962f
|
4
|
+
data.tar.gz: 2ec1b073ec014f15a7820297cfdaa46457b94130
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e0a368fe63c7fd4b6f2d0f5636abd922797ff0cc84cd41fdd728803f245d5380e746a3dba02daa585dbe26c7ac84f11f94ac18cdec928dbfe8560a1a45c833d9
|
7
|
+
data.tar.gz: 178b2cc47e431cbc6c11ddd4fecd55394dc5498cd98651c4a632f1c923b2fd2ca73ed71c65353f9d33662141b51b51b5e4d03db51dade1876d5de1c16781359f
|
@@ -17,7 +17,7 @@ module PragmaticTokenizer
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def separate
|
20
|
-
create_cleaned_tokens
|
20
|
+
@cleaned_tokens = create_cleaned_tokens
|
21
21
|
replace_last_token unless @cleaned_tokens.empty?
|
22
22
|
@cleaned_tokens
|
23
23
|
end
|
@@ -25,21 +25,15 @@ module PragmaticTokenizer
|
|
25
25
|
private
|
26
26
|
|
27
27
|
def create_cleaned_tokens
|
28
|
-
@
|
29
|
-
|
30
|
-
|
31
|
-
match = Regexp.last_match(1)
|
32
|
-
if abbreviation?(match)
|
33
|
-
@cleaned_tokens += [match, DOT]
|
34
|
-
next
|
35
|
-
end
|
36
|
-
end
|
37
|
-
@cleaned_tokens << token
|
38
|
-
end
|
28
|
+
@tokens[0..-2]
|
29
|
+
.flat_map { |token| abbreviation?(token) ? [token[0..-2], DOT] : token }
|
30
|
+
.push(@tokens.last)
|
39
31
|
end
|
40
32
|
|
41
33
|
def abbreviation?(token)
|
42
|
-
|
34
|
+
return false unless token.end_with?(DOT) && token.length > 1
|
35
|
+
shortened = token.chomp(DOT)
|
36
|
+
!defined_abbreviation?(shortened) && shortened !~ REGEXP_ONLY_LETTERS && shortened !~ REGEXP_ABBREVIATION
|
43
37
|
end
|
44
38
|
|
45
39
|
def defined_abbreviation?(token)
|
@@ -52,7 +46,9 @@ module PragmaticTokenizer
|
|
52
46
|
|
53
47
|
def replace_last_token
|
54
48
|
last_token = @cleaned_tokens[-1]
|
55
|
-
return
|
49
|
+
return unless last_token.end_with?(DOT) && last_token.length > 1
|
50
|
+
shortened = last_token.chomp(DOT)
|
51
|
+
return if defined_abbreviation?(shortened) || last_token !~ REGEXP_ENDS_WITH_DOT
|
56
52
|
@cleaned_tokens[-1] = Regexp.last_match(1)
|
57
53
|
@cleaned_tokens << DOT
|
58
54
|
end
|
@@ -1,26 +1,43 @@
|
|
1
1
|
module PragmaticTokenizer
|
2
2
|
class PostProcessor
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
4
|
+
DOT = '.'.freeze
|
5
|
+
RANGE_DINGBATS = '[\u2701-\u27BE]'.freeze # e.g. ✁✎✳❄➾
|
6
|
+
RANGE_VARIATION_SELECTORS = '[\uFE00-\uFE0F]'.freeze # alter the previous character
|
7
|
+
RANGE_FULLWIDTH = '[\uFF01-\ufF1F]'.freeze # e.g. ！＂＃＇？
|
8
|
+
|
9
|
+
REGEXP_COMMAS = /^([,‚])+/
|
10
|
+
REGEXP_SINGLE_QUOTES = /(.+)([’'‘`])$/
|
11
|
+
REGEXP_SLASH = /^(?!(https?:|www\.))(.*)\//
|
12
|
+
REGEXP_QUESTION_MARK = /^(?!(https?:|www\.))(.*)(\?)/
|
9
13
|
REGEXP_PLUS_SIGN = /(.+)\+(.+)/
|
10
|
-
REGEXP_COLON = /^(
|
11
|
-
|
14
|
+
REGEXP_COLON = /^(:)(\S{2,})/
|
15
|
+
REGEXP_DINGBATS = /(#{RANGE_DINGBATS}#{RANGE_VARIATION_SELECTORS}*)/
|
16
|
+
REGEXP_ENDING_PUNCT = /(?<=\S)([#{RANGE_FULLWIDTH}!?]+)$/
|
17
|
+
REGEXP_DOMAIN = /^((https?:\/\/|)?[a-z0-9]+([\-\.][a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?)$/ix
|
18
|
+
REGEXP_EMAIL = /\S+[@＠]\S+/
|
19
|
+
REGEXP_DOMAIN_START = /^(https?:|www\.|[[:alpha:]]\.)/
|
20
|
+
REGEXP_DOMAIN_END = /\.(com|net|org|edu|gov|mil|int|[[:alpha:]]{2})$/
|
21
|
+
REGEXP_DIGIT = /[[:digit:]]+/
|
22
|
+
REGEXP_PERIOD1 = /(.*\.)/
|
23
|
+
REGEXP_PERIOD2 = /(\.)/
|
12
24
|
|
13
25
|
REGEX_UNIFIED1 = Regexp.union(REGEXP_SLASH,
|
14
26
|
REGEXP_QUESTION_MARK,
|
15
27
|
REGEXP_PLUS_SIGN,
|
16
28
|
REGEXP_COLON,
|
17
|
-
|
29
|
+
REGEXP_DINGBATS,
|
18
30
|
PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX,
|
19
31
|
PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX)
|
20
32
|
|
21
33
|
REGEX_UNIFIED2 = Regexp.union(REGEXP_SINGLE_QUOTES,
|
22
34
|
REGEXP_COMMAS)
|
23
|
-
|
35
|
+
|
36
|
+
REGEX_DOMAIN_EMAIL = Regexp.union(REGEXP_DOMAIN,
|
37
|
+
REGEXP_EMAIL)
|
38
|
+
|
39
|
+
REGEX_DOMAIN = Regexp.union(REGEXP_DOMAIN_START,
|
40
|
+
REGEXP_DOMAIN_END)
|
24
41
|
|
25
42
|
attr_reader :text, :abbreviations, :downcase
|
26
43
|
|
@@ -31,19 +48,24 @@ module PragmaticTokenizer
|
|
31
48
|
end
|
32
49
|
|
33
50
|
def post_process
|
34
|
-
|
51
|
+
procs.reduce(full_stop_separated_tokens) { |a, e| a.flat_map(&e) }
|
35
52
|
end
|
36
53
|
|
37
54
|
private
|
38
55
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
56
|
+
# note: we need to run #separate_ending_punctuation twice. maybe there's a better solution?
|
57
|
+
def procs
|
58
|
+
[
|
59
|
+
separate_ending_punctuation,
|
60
|
+
unified1,
|
61
|
+
split_unknown_period1,
|
62
|
+
split_unknown_period2,
|
63
|
+
separate_ending_punctuation
|
64
|
+
]
|
43
65
|
end
|
44
66
|
|
45
|
-
def separate_ending_punctuation
|
46
|
-
|
67
|
+
def separate_ending_punctuation
|
68
|
+
proc { |token| token.split(REGEXP_ENDING_PUNCT) }
|
47
69
|
end
|
48
70
|
|
49
71
|
def unified1
|
@@ -51,64 +73,48 @@ module PragmaticTokenizer
|
|
51
73
|
end
|
52
74
|
|
53
75
|
def full_stop_separated_tokens
|
54
|
-
FullStopSeparator.new(tokens:
|
76
|
+
FullStopSeparator.new(tokens: split_convert_commas_quotes, abbreviations: abbreviations, downcase: downcase).separate
|
55
77
|
end
|
56
78
|
|
57
|
-
def
|
79
|
+
def split_convert_commas_quotes
|
58
80
|
text
|
59
81
|
.split
|
60
82
|
.flat_map { |token| token.split(REGEX_UNIFIED2) }
|
61
83
|
.flat_map { |token| convert_sym_to_punct(token) }
|
62
84
|
end
|
63
85
|
|
64
|
-
def split_emoji
|
65
|
-
proc { |token| (token =~ /(\A|\S)\u{2744}[^\u{FE0E}|\u{FE0F}]/) ? token.split(/(\u{2744})/) : token }
|
66
|
-
end
|
67
|
-
|
68
86
|
def split_unknown_period1
|
69
|
-
proc { |token| unknown_period1?(token) ? token.split(
|
87
|
+
proc { |token| unknown_period1?(token) ? token.split(REGEXP_PERIOD1) : token }
|
70
88
|
end
|
71
89
|
|
72
90
|
def split_unknown_period2
|
73
|
-
proc { |token| unknown_period2?(token) ? token.split(
|
91
|
+
proc { |token| unknown_period2?(token) ? token.split(REGEXP_PERIOD2) : token }
|
74
92
|
end
|
75
93
|
|
76
94
|
def unknown_period1?(token)
|
77
|
-
token.include?(
|
78
|
-
token !~ /(http|https|www)(\.|:)/ &&
|
95
|
+
token.include?(DOT) &&
|
79
96
|
token.length > 1 &&
|
80
|
-
token !~
|
81
|
-
token !~ /\S+(@|＠)\S+/ &&
|
97
|
+
token !~ REGEX_DOMAIN_EMAIL &&
|
82
98
|
abbreviations.include?(extract_abbreviation(token))
|
83
99
|
end
|
84
100
|
|
85
101
|
def unknown_period2?(token)
|
86
|
-
token.include?(
|
87
|
-
token !~
|
88
|
-
token !~
|
89
|
-
token
|
90
|
-
token
|
91
|
-
token !~ /\A[a-zA-Z]{1}\./ &&
|
92
|
-
token.count(".") == 1 &&
|
93
|
-
token !~ /\d+/ &&
|
94
|
-
!abbreviations.include?(extract_abbreviation(token)) &&
|
95
|
-
token !~ /\S+(@|＠)\S+/
|
102
|
+
token.include?(DOT) &&
|
103
|
+
token !~ REGEX_DOMAIN &&
|
104
|
+
token !~ REGEXP_DIGIT &&
|
105
|
+
token.count(DOT) == 1 &&
|
106
|
+
!abbreviations.include?(extract_abbreviation(token))
|
96
107
|
end
|
97
108
|
|
98
109
|
def extract_abbreviation(token)
|
99
|
-
before_first_dot = token[0, token.index(
|
110
|
+
before_first_dot = token[0, token.index(DOT)]
|
100
111
|
downcase ? before_first_dot : Unicode.downcase(before_first_dot)
|
101
112
|
end
|
102
113
|
|
103
114
|
def convert_sym_to_punct(token)
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
else
|
108
|
-
pattern = symbol_matches[0]
|
109
|
-
replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(pattern)
|
110
|
-
token.gsub!(pattern, replacement)
|
111
|
-
end
|
115
|
+
PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP
|
116
|
+
.each { |pattern, replacement| break if token.sub!(replacement, pattern) }
|
117
|
+
token
|
112
118
|
end
|
113
119
|
|
114
120
|
end
|
@@ -64,7 +64,7 @@ module PragmaticTokenizer
|
|
64
64
|
REGEXP_NO_NUMBERS = /\A\D+\z/
|
65
65
|
REGEXP_NUMBER = /\D*\d+\d*/
|
66
66
|
REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
|
67
|
-
REGEXP_CHUNK_STRING =
|
67
|
+
REGEXP_CHUNK_STRING = /\S.{1,10000}(?!\S)/m
|
68
68
|
|
69
69
|
# @param [Hash] opts optional arguments
|
70
70
|
|
@@ -150,7 +150,7 @@ module PragmaticTokenizer
|
|
150
150
|
|
151
151
|
def tokenize(text)
|
152
152
|
return [] unless text
|
153
|
-
raise "In
|
153
|
+
raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
|
154
154
|
CGI.unescapeHTML(text)
|
155
155
|
.scan(REGEXP_CHUNK_STRING)
|
156
156
|
.flat_map { |segment| post_process(pre_process(segment)) }
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.
|
4
|
+
version: 3.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-03-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode
|
@@ -169,7 +169,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
169
169
|
version: '0'
|
170
170
|
requirements: []
|
171
171
|
rubyforge_project:
|
172
|
-
rubygems_version: 2.6.
|
172
|
+
rubygems_version: 2.6.14
|
173
173
|
signing_key:
|
174
174
|
specification_version: 4
|
175
175
|
summary: A multilingual tokenizer
|