pragmatic_tokenizer 3.0.0 → 3.0.1
- checksums.yaml +4 -4
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +45 -31
- data/lib/pragmatic_tokenizer/languages/common.rb +16 -5
- data/lib/pragmatic_tokenizer/languages/english.rb +18 -5
- data/lib/pragmatic_tokenizer/languages/french.rb +10 -4
- data/lib/pragmatic_tokenizer/post_processor.rb +9 -7
- data/lib/pragmatic_tokenizer/pre_processor.rb +7 -7
- data/lib/pragmatic_tokenizer/tokenizer.rb +1 -1
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +2 -3
- data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +0 -31
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2f5112bf38a65d6cfc437fd9b735a5011479807d
+  data.tar.gz: 3963cf07246508f250bc2391fccf4245f1d275b0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fc9fe3c9b9c6aca7ca355ac6a1000f973cdc4f1bd94babd27bad78add4b761cc030bbeddac8ad9deea3ab9e0c4849efa918f2808de198c288fecb7d58af0a4df
+  data.tar.gz: 4c28cb848d3ad5b7f29284b2f0a2d63bc9860a4d9c306d5f173af3f2c98f54241e5bf6a190380dd15e431f792860303aceea58460c58032a2579cc07b2df8cba
data/lib/pragmatic_tokenizer/full_stop_separator.rb CHANGED
@@ -4,46 +4,60 @@ module PragmaticTokenizer
   # This class separates true full stops while ignoring
   # periods that are part of an abbreviation
   class FullStopSeparator
-
+
+    REGEXP_ENDS_WITH_DOT = /\A(.+)\.\z/
+    REGEXP_ONLY_LETTERS = /\A[a-z]\z/i
+    REGEXP_UNKNOWN1 = /[a-z](?:\.[a-z])+\z/i
+    REGEXP_UNKNOWN2 = /\A(.*\w)\.\z/
+    DOT = '.'.freeze
+
     def initialize(tokens:, abbreviations:, downcase:)
-      @tokens
+      @tokens = tokens
       @abbreviations = abbreviations
-      @downcase
+      @downcase = downcase
     end
 
     def separate
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-          cleaned_tokens << '.'
-          next
+      create_cleaned_tokens
+      replace_last_token unless @cleaned_tokens.empty?
+      @cleaned_tokens
+    end
+
+    private
+
+    def create_cleaned_tokens
+      @cleaned_tokens = []
+      @tokens.each_with_index do |token, position|
+        if @tokens[position + 1] && token =~ REGEXP_ENDS_WITH_DOT
+          match = Regexp.last_match(1)
+          if unknown_method1(match)
+            @cleaned_tokens += [match, DOT]
+            next
+          end
         end
+        @cleaned_tokens << token
       end
-      cleaned_tokens << tokens[i]
     end
-
-
-
-
+
+    def unknown_method1(token)
+      !abbreviation?(token) && token !~ REGEXP_ONLY_LETTERS && token !~ REGEXP_UNKNOWN1
+    end
+
+    def abbreviation?(token)
+      @abbreviations.include?(inverse_case(token))
     end
-
-
-
+
+    def inverse_case(token)
+      @downcase ? token : Unicode.downcase(token)
     end
-
-
+
+    def replace_last_token
+      last_token = @cleaned_tokens[-1]
+      return if abbreviation?(last_token.chomp(DOT)) || last_token !~ REGEXP_UNKNOWN2
+      @cleaned_tokens[-1] = Regexp.last_match(1)
+      @cleaned_tokens << DOT
+    end
+
   end
+
 end
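The refactor above replaces one long inline loop in #separate with small private helpers. A minimal sketch of the resulting behavior, assuming the class is called directly with a lowercased Set of abbreviations; in the gem it is an internal collaborator of Tokenizer, so direct use is illustrative only:

require 'pragmatic_tokenizer'
require 'set'

separator = PragmaticTokenizer::FullStopSeparator.new(
  tokens:        ["Mr.", "Smith", "arrived."],
  abbreviations: Set.new(["mr"]),
  downcase:      false
)
# "Mr." survives intact because abbreviation? matches its downcased form;
# replace_last_token then splits the sentence-final period off "arrived.".
separator.separate # => ["Mr.", "Smith", "arrived", "."]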
data/lib/pragmatic_tokenizer/languages/common.rb CHANGED
@@ -15,14 +15,25 @@ module PragmaticTokenizer
     EMOTICON_REGEX = /(?::|;|=)(?:-)?(?:\)|D|P)/
 
     class SingleQuotes
+
+      REGEXP_LEFT_QUOTES1 = /(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o
+      REGEXP_LEFT_QUOTES2 = /(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o
+      REGEXP_LEFT_QUOTES3 = /(\W|^)'(?=.*\w)/o
+      REGEXP_RIGHT_SIDE_QUOTES = /(\w|\D)'(?!')(?=\W|$)/o
+
       def handle_single_quotes(text)
         # Convert left quotes to special character except for 'Twas or 'twas
-
-        text.gsub!(
-        text.gsub!(
-
-
+        replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
+        text.gsub!(REGEXP_LEFT_QUOTES1, "\\1 #{replacement} ")
+        text.gsub!(REGEXP_LEFT_QUOTES3, ' ' << replacement)
+        text.gsub!(REGEXP_RIGHT_SIDE_QUOTES, "\\1 #{replacement} ")
+
+        replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘".freeze]
+        text.gsub!(REGEXP_LEFT_QUOTES2, "\\1 #{replacement} ")
+
+        text
       end
+
     end
   end
 end
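The quote rules are now named constants compiled once rather than inline literals. A rough sketch of their effect, assuming PUNCTUATION_MAP maps the ASCII apostrophe to the placeholder ☮ (the French rules later in this diff suggest that value; the exact placeholder is an internal detail):

handler = PragmaticTokenizer::Languages::Common::SingleQuotes.new

# REGEXP_LEFT_QUOTES1 skips 'Twas/'twas via negative lookaheads;
# REGEXP_RIGHT_SIDE_QUOTES pads a trailing possessive quote.
handler.handle_single_quotes("'Cause the dogs' bowl".dup)
# => roughly " ☮ Cause the dogs ☮  bowl" (placeholder assumed; later
#    whitespace splitting absorbs the extra padding)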
data/lib/pragmatic_tokenizer/languages/english.rb CHANGED
@@ -95,16 +95,29 @@ module PragmaticTokenizer
       "will-o'-the-wisp" => "will-of-the-wisp",
       "'twas" => "it was"
     }.freeze
+
     class SingleQuotes
+
+      REGEXP_LEFT_QUOTES1 = /(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o
+      REGEXP_LEFT_QUOTES2 = /(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o
+      REGEXP_LEFT_QUOTES3 = /(\W|^)'(?=.*\w)/o
+      REGEXP_RIGHT_SIDE_QUOTES = /(\w|\D)'(?!')(?=\W|$)/o
+
       def handle_single_quotes(text)
         # Convert left quotes to special character except for 'Twas or 'twas
-
-        text.gsub!(
-        text.gsub!(
-
-
+        replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
+        text.gsub!(REGEXP_LEFT_QUOTES1, "\\1 #{replacement} ")
+        text.gsub!(REGEXP_LEFT_QUOTES3, ' ' << replacement)
+        text.gsub!(REGEXP_RIGHT_SIDE_QUOTES, "\\1 #{replacement} ")
+
+        replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘".freeze]
+        text.gsub!(REGEXP_LEFT_QUOTES2, "\\1 #{replacement} ")
+
+        text
       end
+
     end
+
     end
   end
 end
data/lib/pragmatic_tokenizer/languages/french.rb CHANGED
@@ -7,11 +7,17 @@ module PragmaticTokenizer
       CONTRACTIONS = {}.freeze
 
       class SingleQuotes
+
+        REGEXP_UNKNOWN1 = /(\w|\D)'(?!')(?=\W|$)/o
+        REGEXP_UNKNOWN2 = /(\W|^)'(?=.*\w)/o
+
         def handle_single_quotes(text)
-
-          text.gsub!(
-          text.gsub!(
-          text.gsub!(/
+          replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
+          text.gsub!(REGEXP_UNKNOWN1, "\\1 #{replacement} ")
+          text.gsub!(REGEXP_UNKNOWN2, ' ' << replacement)
+          text.gsub!(/l\'/, '\1 l☮ \2')
+          text.gsub!(/L\'/, '\1 L☮ \2')
+          text
         end
       end
     end
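The two added elision rules are worth a close look: their patterns define no capture groups, yet the replacement strings reference \1 and \2. In Ruby, a backreference to a nonexistent group in a gsub replacement expands to the empty string, so the net effect is simply to pad the elided article. A quick check of that behavior, using the ☮ placeholder exactly as it appears in the diff:

# '\1' and '\2' expand to "" because /l\'/ captures nothing,
# so the replacement collapses to " l☮ ".
"l'amour".gsub(/l\'/, '\1 l☮ \2') # => " l☮ amour"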
data/lib/pragmatic_tokenizer/post_processor.rb CHANGED
@@ -20,6 +20,7 @@ module PragmaticTokenizer
 
     REGEX_UNIFIED2 = Regexp.union(REGEXP_SINGLE_QUOTES,
                                   REGEXP_COMMAS)
+    REGEXP_UNKNOWN1 = /(?<=\S)([。.!!??]+)$/
 
     attr_reader :text, :abbreviations, :downcase
 
@@ -30,17 +31,21 @@ module PragmaticTokenizer
     end
 
     def post_process
-
+      separate_ending_punctuation(method_name3)
     end
 
     private
 
     def method_name3
-      separated =
+      separated = separate_ending_punctuation(full_stop_separated_tokens)
       procs = [unified1, split_unknown_period1, split_unknown_period2, split_emoji]
       procs.reduce(separated) { |a, e| a.flat_map(&e) }
     end
 
+    def separate_ending_punctuation(tokens)
+      tokens.flat_map { |token| token.split(REGEXP_UNKNOWN1) }
+    end
+
     def unified1
       proc { |token| token.split(REGEX_UNIFIED1) }
     end
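The new separate_ending_punctuation helper, together with REGEXP_UNKNOWN1 from the first hunk, is what replaces the deleted EndingPunctuationSeparator class at the end of this diff. Because the pattern wraps the punctuation run in a capture group, String#split returns the run as its own element:

regexp = /(?<=\S)([。.!!??]+)$/ # REGEXP_UNKNOWN1

"Hello!!".split(regexp) # => ["Hello", "!!"]
"U.S.A.".split(regexp)  # => ["U.S.A", "."]
"?".split(regexp)       # => ["?"]  (the lookbehind requires a preceding non-space)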
@@ -91,11 +96,8 @@ module PragmaticTokenizer
     end
 
     def extract_abbreviation(token)
-
-
-      else
-        Unicode.downcase(token.split(/(\.)/)[0])
-      end
+      before_first_dot = token[0, token.index('.'.freeze)]
+      downcase ? before_first_dot : Unicode.downcase(before_first_dot)
     end
 
     def convert_sym_to_punct(token)
data/lib/pragmatic_tokenizer/pre_processor.rb CHANGED
@@ -59,15 +59,15 @@ module PragmaticTokenizer
     end
 
     def shift_horizontal_ellipsis!
-      gsub!(/(…+)/o
+      gsub!(/(…+)/o, ' \1 ')
     end
 
     def shift_ellipse_two_dots!
-      gsub!(/(\.\.+)/o
+      gsub!(/(\.\.+)/o, ' \1 ')
     end
 
     def shift_ellipse_three_dots!
-      gsub!(/(\.\.\.+)/o
+      gsub!(/(\.\.\.+)/o, ' \1 ')
     end
 
     def shift_no_space_mention!
@@ -98,11 +98,11 @@ module PragmaticTokenizer
     end
 
     def shift_bracket!
-      gsub!(/([\(\[\{\}\]\)])/o
+      gsub!(/([\(\[\{\}\]\)])/o, ' \1 ')
     end
 
     def shift_semicolon!
-      gsub!(/([;])/o
+      gsub!(/([;])/o, ' \1 ')
     end
 
     def shift_percent!
@@ -138,7 +138,7 @@ module PragmaticTokenizer
 
     def replace_left_quotes!(style, replacement_key)
       replacement = replacement_for_key(replacement_key)
-      gsub!(/#{style}(?=.*\w)/o, ' '
+      gsub!(/#{style}(?=.*\w)/o, ' ' << replacement << ' ')
     end
 
     def replace_remaining_double_quotes!
@@ -149,7 +149,7 @@ module PragmaticTokenizer
 
     def replace_remaining_quotes!(style, replacement_key)
       replacement = replacement_for_key(replacement_key)
-      gsub!(/#{style}/, ' '
+      gsub!(/#{style}/, ' ' << replacement << ' ')
    end
 
     def convert_sgl_quotes!(language)
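All of these pre-processor fixes complete gsub! calls that had lost their replacement argument; each pads the captured punctuation with spaces so a later whitespace split yields separate tokens. A minimal illustration of the ' \1 ' idiom, reusing the shift_bracket! pattern:

"(see [1])".gsub(/([\(\[\{\}\]\)])/, ' \1 ')
# => " ( see  [ 1 ]  ) "
# The doubled spaces are harmless; the tokenizer presumably collapses
# them when it later splits on whitespace.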
data/lib/pragmatic_tokenizer/tokenizer.rb CHANGED
@@ -3,8 +3,8 @@ require 'pragmatic_tokenizer/languages'
 require 'pragmatic_tokenizer/pre_processor'
 require 'pragmatic_tokenizer/post_processor'
 require 'pragmatic_tokenizer/full_stop_separator'
-require 'pragmatic_tokenizer/ending_punctuation_separator'
 require 'unicode'
+require 'set'
 
 module PragmaticTokenizer
   class Tokenizer
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0.0
+  version: 3.0.1
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-02-
+date: 2016-02-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -111,7 +111,6 @@ files:
 - README.md
 - Rakefile
 - lib/pragmatic_tokenizer.rb
-- lib/pragmatic_tokenizer/ending_punctuation_separator.rb
 - lib/pragmatic_tokenizer/full_stop_separator.rb
 - lib/pragmatic_tokenizer/languages.rb
 - lib/pragmatic_tokenizer/languages/arabic.rb
data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb DELETED
@@ -1,31 +0,0 @@
-# -*- encoding : utf-8 -*-
-
-module PragmaticTokenizer
-  # This class separates ending punctuation from a token
-  class EndingPunctuationSeparator
-    attr_reader :tokens
-    def initialize(tokens:)
-      @tokens = tokens
-    end
-
-    def separate
-      cleaned_tokens = []
-      tokens.each do |a|
-        split_punctuation = a.scan(/(?<=\S)[。.!!??]+$/)
-        if split_punctuation[0].nil?
-          cleaned_tokens << a
-        else
-          cleaned_tokens << a.tr(split_punctuation[0], '')
-          if split_punctuation[0].length.eql?(1)
-            cleaned_tokens << split_punctuation[0]
-          else
-            split_punctuation[0].split("").each do |s|
-              cleaned_tokens << s
-            end
-          end
-        end
-      end
-      cleaned_tokens
-    end
-  end
-end
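For reference, the deleted class split a run of ending punctuation into individual characters, whereas its replacement in post_processor.rb keeps the run together. A sketch of the difference, reproducible from the two code paths shown in this diff:

token = "there!!"

# Old path: tr strips the run, then each character is emitted separately.
run = token.scan(/(?<=\S)[。.!!??]+$/)[0]  # => "!!"
[token.tr(run, ''), *run.split('')]        # => ["there", "!", "!"]

# New path: split with a capturing group keeps the run as one token.
token.split(/(?<=\S)([。.!!??]+)$/)        # => ["there", "!!"]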