pragmatic_tokenizer 3.0.0 → 3.0.1
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in the public registry.
- checksums.yaml +4 -4
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +45 -31
- data/lib/pragmatic_tokenizer/languages/common.rb +16 -5
- data/lib/pragmatic_tokenizer/languages/english.rb +18 -5
- data/lib/pragmatic_tokenizer/languages/french.rb +10 -4
- data/lib/pragmatic_tokenizer/post_processor.rb +9 -7
- data/lib/pragmatic_tokenizer/pre_processor.rb +7 -7
- data/lib/pragmatic_tokenizer/tokenizer.rb +1 -1
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +2 -3
- data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +0 -31
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2f5112bf38a65d6cfc437fd9b735a5011479807d
+  data.tar.gz: 3963cf07246508f250bc2391fccf4245f1d275b0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fc9fe3c9b9c6aca7ca355ac6a1000f973cdc4f1bd94babd27bad78add4b761cc030bbeddac8ad9deea3ab9e0c4849efa918f2808de198c288fecb7d58af0a4df
+  data.tar.gz: 4c28cb848d3ad5b7f29284b2f0a2d63bc9860a4d9c306d5f173af3f2c98f54241e5bf6a190380dd15e431f792860303aceea58460c58032a2579cc07b2df8cba
data/lib/pragmatic_tokenizer/full_stop_separator.rb
CHANGED
@@ -4,46 +4,60 @@ module PragmaticTokenizer
   # This class separates true full stops while ignoring
   # periods that are part of an abbreviation
   class FullStopSeparator
-
+
+    REGEXP_ENDS_WITH_DOT = /\A(.+)\.\z/
+    REGEXP_ONLY_LETTERS = /\A[a-z]\z/i
+    REGEXP_UNKNOWN1 = /[a-z](?:\.[a-z])+\z/i
+    REGEXP_UNKNOWN2 = /\A(.*\w)\.\z/
+    DOT = '.'.freeze
+
     def initialize(tokens:, abbreviations:, downcase:)
-      @tokens
+      @tokens = tokens
       @abbreviations = abbreviations
-      @downcase
+      @downcase = downcase
     end
 
     def separate
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      cleaned_tokens << '.'
-      next
+      create_cleaned_tokens
+      replace_last_token unless @cleaned_tokens.empty?
+      @cleaned_tokens
+    end
+
+    private
+
+    def create_cleaned_tokens
+      @cleaned_tokens = []
+      @tokens.each_with_index do |token, position|
+        if @tokens[position + 1] && token =~ REGEXP_ENDS_WITH_DOT
+          match = Regexp.last_match(1)
+          if unknown_method1(match)
+            @cleaned_tokens += [match, DOT]
+            next
+          end
         end
+        @cleaned_tokens << token
       end
-      cleaned_tokens << tokens[i]
     end
-
-
-
-
+
+    def unknown_method1(token)
+      !abbreviation?(token) && token !~ REGEXP_ONLY_LETTERS && token !~ REGEXP_UNKNOWN1
+    end
+
+    def abbreviation?(token)
+      @abbreviations.include?(inverse_case(token))
     end
-
-
-
+
+    def inverse_case(token)
+      @downcase ? token : Unicode.downcase(token)
     end
-
-
+
+    def replace_last_token
+      last_token = @cleaned_tokens[-1]
+      return if abbreviation?(last_token.chomp(DOT)) || last_token !~ REGEXP_UNKNOWN2
+      @cleaned_tokens[-1] = Regexp.last_match(1)
+      @cleaned_tokens << DOT
+    end
+
   end
+
 end
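To make the refactored control flow concrete, here is a minimal usage sketch. The token list, abbreviation list, and expected output are illustrative, not fixtures from the gem:

require 'pragmatic_tokenizer'

# "Mr." survives because abbreviation?("Mr") downcases it and finds "mr"
# in the abbreviation list; replace_last_token then splits the true full
# stop off the final token.
separator = PragmaticTokenizer::FullStopSeparator.new(
  tokens:        ["Mr.", "Smith", "arrived."],
  abbreviations: ["mr"],
  downcase:      false
)
separator.separate # => ["Mr.", "Smith", "arrived", "."]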
data/lib/pragmatic_tokenizer/languages/common.rb
CHANGED
@@ -15,14 +15,25 @@ module PragmaticTokenizer
       EMOTICON_REGEX = /(?::|;|=)(?:-)?(?:\)|D|P)/
 
       class SingleQuotes
+
+        REGEXP_LEFT_QUOTES1 = /(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o
+        REGEXP_LEFT_QUOTES2 = /(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o
+        REGEXP_LEFT_QUOTES3 = /(\W|^)'(?=.*\w)/o
+        REGEXP_RIGHT_SIDE_QUOTES = /(\w|\D)'(?!')(?=\W|$)/o
+
         def handle_single_quotes(text)
           # Convert left quotes to special character except for 'Twas or 'twas
-
-          text.gsub!(
-          text.gsub!(
-
-
+          replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
+          text.gsub!(REGEXP_LEFT_QUOTES1, "\\1 #{replacement} ")
+          text.gsub!(REGEXP_LEFT_QUOTES3, ' ' << replacement)
+          text.gsub!(REGEXP_RIGHT_SIDE_QUOTES, "\\1 #{replacement} ")
+
+          replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘".freeze]
+          text.gsub!(REGEXP_LEFT_QUOTES2, "\\1 #{replacement} ")
+
+          text
         end
+
       end
     end
   end
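The rewrite appears to extract the previously inline regexes into named constants; handle_single_quotes still swaps each single quote for a space-padded placeholder drawn from PUNCTUATION_MAP so that a later whitespace split isolates it. A rough sketch (the input is illustrative, and the ☮ glyph for PUNCTUATION_MAP["'"] is inferred from the french.rb hunk below):

quotes = PragmaticTokenizer::Languages::Common::SingleQuotes.new
quotes.handle_single_quotes("she said 'hello there' and left".dup)
# => roughly "she said  ☮ hello there ☮  and left"; both quote marks
#    are now stand-alone placeholder tokens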
data/lib/pragmatic_tokenizer/languages/english.rb
CHANGED
@@ -95,16 +95,29 @@ module PragmaticTokenizer
         "will-o'-the-wisp" => "will-of-the-wisp",
         "'twas" => "it was"
       }.freeze
+
       class SingleQuotes
+
+        REGEXP_LEFT_QUOTES1 = /(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o
+        REGEXP_LEFT_QUOTES2 = /(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o
+        REGEXP_LEFT_QUOTES3 = /(\W|^)'(?=.*\w)/o
+        REGEXP_RIGHT_SIDE_QUOTES = /(\w|\D)'(?!')(?=\W|$)/o
+
         def handle_single_quotes(text)
           # Convert left quotes to special character except for 'Twas or 'twas
-
-          text.gsub!(
-          text.gsub!(
-
-
+          replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
+          text.gsub!(REGEXP_LEFT_QUOTES1, "\\1 #{replacement} ")
+          text.gsub!(REGEXP_LEFT_QUOTES3, ' ' << replacement)
+          text.gsub!(REGEXP_RIGHT_SIDE_QUOTES, "\\1 #{replacement} ")
+
+          replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘".freeze]
+          text.gsub!(REGEXP_LEFT_QUOTES2, "\\1 #{replacement} ")
+
+          text
         end
+
       end
+
     end
   end
 end
data/lib/pragmatic_tokenizer/languages/french.rb
CHANGED
@@ -7,11 +7,17 @@ module PragmaticTokenizer
       CONTRACTIONS = {}.freeze
 
       class SingleQuotes
+
+        REGEXP_UNKNOWN1 = /(\w|\D)'(?!')(?=\W|$)/o
+        REGEXP_UNKNOWN2 = /(\W|^)'(?=.*\w)/o
+
         def handle_single_quotes(text)
-
-          text.gsub!(
-          text.gsub!(
-          text.gsub!(/
+          replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
+          text.gsub!(REGEXP_UNKNOWN1, "\\1 #{replacement} ")
+          text.gsub!(REGEXP_UNKNOWN2, ' ' << replacement)
+          text.gsub!(/l\'/, '\1 l☮ \2')
+          text.gsub!(/L\'/, '\1 L☮ \2')
+          text
        end
      end
    end
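One quirk worth noting: /l\'/ and /L\'/ define no capture groups, so the \1 and \2 backreferences in their replacement strings expand to empty strings, and only the space-padded l☮ / L☮ survives. A quick check in plain Ruby:

"l'avion".gsub(/l\'/, '\1 l☮ \2') # => " l☮ avion"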
data/lib/pragmatic_tokenizer/post_processor.rb
CHANGED
@@ -20,6 +20,7 @@ module PragmaticTokenizer
 
     REGEX_UNIFIED2 = Regexp.union(REGEXP_SINGLE_QUOTES,
                                   REGEXP_COMMAS)
+    REGEXP_UNKNOWN1 = /(?<=\S)([。.!!??]+)$/
 
     attr_reader :text, :abbreviations, :downcase
 
@@ -30,17 +31,21 @@ module PragmaticTokenizer
     end
 
     def post_process
-
+      separate_ending_punctuation(method_name3)
     end
 
     private
 
     def method_name3
-      separated =
+      separated = separate_ending_punctuation(full_stop_separated_tokens)
       procs = [unified1, split_unknown_period1, split_unknown_period2, split_emoji]
       procs.reduce(separated) { |a, e| a.flat_map(&e) }
     end
 
+    def separate_ending_punctuation(tokens)
+      tokens.flat_map { |token| token.split(REGEXP_UNKNOWN1) }
+    end
+
     def unified1
       proc { |token| token.split(REGEX_UNIFIED1) }
     end
@@ -91,11 +96,8 @@ module PragmaticTokenizer
     end
 
     def extract_abbreviation(token)
-
-
-      else
-        Unicode.downcase(token.split(/(\.)/)[0])
-      end
+      before_first_dot = token[0, token.index('.'.freeze)]
+      downcase ? before_first_dot : Unicode.downcase(before_first_dot)
     end
 
     def convert_sym_to_punct(token)
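The new REGEXP_UNKNOWN1 is the same ending-punctuation pattern that the now-deleted EndingPunctuationSeparator scanned for (see the removed file at the end of this diff), so that class's job appears to have been folded into separate_ending_punctuation. Because the pattern keeps a capture group, String#split returns each punctuation run as its own token. An illustrative check:

pattern = /(?<=\S)([。.!!??]+)$/

"Really??".split(pattern)                          # => ["Really", "??"]
["Hi!", "there"].flat_map { |t| t.split(pattern) } # => ["Hi", "!", "there"]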
data/lib/pragmatic_tokenizer/pre_processor.rb
CHANGED
@@ -59,15 +59,15 @@ module PragmaticTokenizer
     end
 
     def shift_horizontal_ellipsis!
-      gsub!(/(…+)/o
+      gsub!(/(…+)/o, ' \1 ')
     end
 
     def shift_ellipse_two_dots!
-      gsub!(/(\.\.+)/o
+      gsub!(/(\.\.+)/o, ' \1 ')
     end
 
     def shift_ellipse_three_dots!
-      gsub!(/(\.\.\.+)/o
+      gsub!(/(\.\.\.+)/o, ' \1 ')
     end
 
     def shift_no_space_mention!
@@ -98,11 +98,11 @@ module PragmaticTokenizer
     end
 
     def shift_bracket!
-      gsub!(/([\(\[\{\}\]\)])/o
+      gsub!(/([\(\[\{\}\]\)])/o, ' \1 ')
     end
 
     def shift_semicolon!
-      gsub!(/([;])/o
+      gsub!(/([;])/o, ' \1 ')
     end
 
     def shift_percent!
@@ -138,7 +138,7 @@ module PragmaticTokenizer
 
     def replace_left_quotes!(style, replacement_key)
       replacement = replacement_for_key(replacement_key)
-      gsub!(/#{style}(?=.*\w)/o, ' '
+      gsub!(/#{style}(?=.*\w)/o, ' ' << replacement << ' ')
     end
 
     def replace_remaining_double_quotes!
@@ -149,7 +149,7 @@ module PragmaticTokenizer
 
     def replace_remaining_quotes!(style, replacement_key)
       replacement = replacement_for_key(replacement_key)
-      gsub!(/#{style}/, ' '
+      gsub!(/#{style}/, ' ' << replacement << ' ')
     end
 
     def convert_sgl_quotes!(language)
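All of these helpers share one idiom: the replacement ' \1 ' pads whatever the regex captures with spaces, so a later whitespace split turns the punctuation into a stand-alone token. A self-contained illustration (not from the gem's test suite):

padded = "wait...what".gsub(/(\.\.\.+)/, ' \1 ')
padded       # => "wait ... what"
padded.split # => ["wait", "...", "what"]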
data/lib/pragmatic_tokenizer/tokenizer.rb
CHANGED
@@ -3,8 +3,8 @@ require 'pragmatic_tokenizer/languages'
 require 'pragmatic_tokenizer/pre_processor'
 require 'pragmatic_tokenizer/post_processor'
 require 'pragmatic_tokenizer/full_stop_separator'
-require 'pragmatic_tokenizer/ending_punctuation_separator'
 require 'unicode'
+require 'set'
 
 module PragmaticTokenizer
   class Tokenizer
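The require swap drops the deleted EndingPunctuationSeparator and pulls in Ruby's stdlib set. A plausible reading, inferred from this diff rather than a changelog, is that abbreviation lists are now held as Sets, making the include? call in FullStopSeparator#abbreviation? a constant-time hash probe instead of an array scan:

require 'set'

abbreviations = Set.new(%w[mr mrs dr])
abbreviations.include?('dr') # => true, in O(1) rather than O(n)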
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0.0
+  version: 3.0.1
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-02-
+date: 2016-02-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -111,7 +111,6 @@ files:
 - README.md
 - Rakefile
 - lib/pragmatic_tokenizer.rb
-- lib/pragmatic_tokenizer/ending_punctuation_separator.rb
 - lib/pragmatic_tokenizer/full_stop_separator.rb
 - lib/pragmatic_tokenizer/languages.rb
 - lib/pragmatic_tokenizer/languages/arabic.rb
data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb
DELETED
@@ -1,31 +0,0 @@
-# -*- encoding : utf-8 -*-
-
-module PragmaticTokenizer
-  # This class separates ending punctuation from a token
-  class EndingPunctuationSeparator
-    attr_reader :tokens
-    def initialize(tokens:)
-      @tokens = tokens
-    end
-
-    def separate
-      cleaned_tokens = []
-      tokens.each do |a|
-        split_punctuation = a.scan(/(?<=\S)[。.!!??]+$/)
-        if split_punctuation[0].nil?
-          cleaned_tokens << a
-        else
-          cleaned_tokens << a.tr(split_punctuation[0], '')
-          if split_punctuation[0].length.eql?(1)
-            cleaned_tokens << split_punctuation[0]
-          else
-            split_punctuation[0].split("").each do |s|
-              cleaned_tokens << s
-            end
-          end
-        end
-      end
-      cleaned_tokens
-    end
-  end
-end