pragmatic_segmenter 0.3.4 → 0.3.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +24 -21
- data/lib/pragmatic_segmenter/languages/arabic.rb +2 -1
- data/lib/pragmatic_segmenter/languages/deutsch.rb +2 -1
- data/lib/pragmatic_segmenter/languages/persian.rb +2 -1
- data/lib/pragmatic_segmenter/languages/russian.rb +4 -3
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +0 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 805bb57215b38dc30f107be60c18c141d8dc1297
|
4
|
+
data.tar.gz: 389bf5f6700f44cb255279a4391c849d0ac67f69
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 749c13c831913bcd606f0d08194d09ca982c7e033c59c8520bf1b20a4146934725ab8252bc54362815cd34f5ea45f8c5ed1cde2da80b8beb506093bd86138522
|
7
|
+
data.tar.gz: d5f05492994604c023fc357b7b030878eb2d8554f2d898efe196530a5777d394359acd4e8c600451cec511df4e2dfafa30cc34d2589a8d983ae3b11299aa9869
|
@@ -67,7 +67,7 @@ module PragmaticSegmenter
|
|
67
67
|
# Some might say that the set of words that follow an
|
68
68
|
# abbreviation such as U.S. (i.e. U.S. Government) is smaller than
|
69
69
|
# the set of words that could start a sentence and
|
70
|
-
# never follow U.S. However, we
|
70
|
+
# never follow U.S. However, we are being conservative
|
71
71
|
# and not splitting by default, so we need to look for places
|
72
72
|
# where we definitely can split. Obviously SENTENCE_STARTERS
|
73
73
|
# will never cover all cases, but as the gem is named
|
@@ -76,17 +76,17 @@ module PragmaticSegmenter
|
|
76
76
|
# sentence but could never follow one of the abbreviations below.
|
77
77
|
|
78
78
|
SENTENCE_STARTERS.each do |word|
|
79
|
-
txt
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
79
|
+
txt.gsub!(/U∯S∯\s#{Regexp.escape(word)}\s/, "U∯S\.\s#{Regexp.escape(word)}\s")
|
80
|
+
txt.gsub!(/U\.S∯\s#{Regexp.escape(word)}\s/, "U\.S\.\s#{Regexp.escape(word)}\s")
|
81
|
+
txt.gsub!(/U∯K∯\s#{Regexp.escape(word)}\s/, "U∯K\.\s#{Regexp.escape(word)}\s")
|
82
|
+
txt.gsub!(/U\.K∯\s#{Regexp.escape(word)}\s/, "U\.K\.\s#{Regexp.escape(word)}\s")
|
83
|
+
txt.gsub!(/E∯U∯\s#{Regexp.escape(word)}\s/, "E∯U\.\s#{Regexp.escape(word)}\s")
|
84
|
+
txt.gsub!(/E\.U∯\s#{Regexp.escape(word)}\s/, "E\.U\.\s#{Regexp.escape(word)}\s")
|
85
|
+
txt.gsub!(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
|
86
|
+
txt.gsub!(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
|
87
|
+
txt.gsub!(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
|
88
|
+
txt.gsub!(/i.v∯\s#{Regexp.escape(word)}\s/, "i\.v\.\s#{Regexp.escape(word)}\s")
|
89
|
+
txt.gsub!(/I.V∯\s#{Regexp.escape(word)}\s/, "I\.V\.\s#{Regexp.escape(word)}\s")
|
90
90
|
end
|
91
91
|
txt
|
92
92
|
end
|
@@ -95,29 +95,32 @@ module PragmaticSegmenter
|
|
95
95
|
mpa = txt.scan(@language::MULTI_PERIOD_ABBREVIATION_REGEX)
|
96
96
|
return txt if mpa.empty?
|
97
97
|
mpa.each do |r|
|
98
|
-
txt
|
98
|
+
txt.gsub!(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
|
99
99
|
end
|
100
100
|
txt
|
101
101
|
end
|
102
102
|
|
103
103
|
def replace_pre_number_abbr(txt, abbr)
|
104
|
-
txt.gsub(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
|
105
|
-
|
106
|
-
|
104
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
|
105
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
|
106
|
+
txt
|
107
107
|
end
|
108
108
|
|
109
109
|
def replace_prepositive_abbr(txt, abbr)
|
110
|
-
txt.gsub(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
|
111
|
-
|
110
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
|
111
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
|
112
|
+
txt
|
112
113
|
end
|
113
114
|
|
114
115
|
def replace_period_of_abbr(txt, abbr)
|
115
|
-
txt.gsub(/(?<=\s#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
|
116
|
-
|
116
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
|
117
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
|
118
|
+
txt
|
117
119
|
end
|
118
120
|
|
119
121
|
def replace_possessive_abbreviations(txt)
|
120
|
-
txt.gsub(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
|
122
|
+
txt.gsub!(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
|
123
|
+
txt
|
121
124
|
end
|
122
125
|
end
|
123
126
|
end
|
@@ -13,9 +13,10 @@ module PragmaticSegmenter
|
|
13
13
|
private
|
14
14
|
|
15
15
|
def replace_period_of_abbr(txt, abbr)
|
16
|
-
txt.gsub(/(?<=\s#{abbr.strip})\./, '∯')
|
17
|
-
|
18
|
-
|
16
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\./, '∯')
|
17
|
+
txt.gsub!(/(?<=\A#{abbr.strip})\./, '∯')
|
18
|
+
txt.gsub!(/(?<=^#{abbr.strip})\./, '∯')
|
19
|
+
txt
|
19
20
|
end
|
20
21
|
end
|
21
22
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.4
|
4
|
+
version: 0.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -160,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
160
160
|
version: '0'
|
161
161
|
requirements: []
|
162
162
|
rubyforge_project:
|
163
|
-
rubygems_version: 2.4.
|
163
|
+
rubygems_version: 2.4.1
|
164
164
|
signing_key:
|
165
165
|
specification_version: 4
|
166
166
|
summary: A rule-based sentence boundary detection gem that works out-of-the-box across
|