pragmatic_segmenter 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +24 -21
- data/lib/pragmatic_segmenter/languages/arabic.rb +2 -1
- data/lib/pragmatic_segmenter/languages/deutsch.rb +2 -1
- data/lib/pragmatic_segmenter/languages/persian.rb +2 -1
- data/lib/pragmatic_segmenter/languages/russian.rb +4 -3
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +0 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 805bb57215b38dc30f107be60c18c141d8dc1297
|
4
|
+
data.tar.gz: 389bf5f6700f44cb255279a4391c849d0ac67f69
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 749c13c831913bcd606f0d08194d09ca982c7e033c59c8520bf1b20a4146934725ab8252bc54362815cd34f5ea45f8c5ed1cde2da80b8beb506093bd86138522
|
7
|
+
data.tar.gz: d5f05492994604c023fc357b7b030878eb2d8554f2d898efe196530a5777d394359acd4e8c600451cec511df4e2dfafa30cc34d2589a8d983ae3b11299aa9869
|
data/lib/pragmatic_segmenter/abbreviation_replacer.rb
CHANGED
@@ -67,7 +67,7 @@ module PragmaticSegmenter
|
|
67
67
|
# Some might say that the set of words that follow an
|
68
68
|
# abbreviation such as U.S. (i.e. U.S. Government) is smaller than
|
69
69
|
# the set of words that could start a sentence and
|
70
|
-
# never follow U.S. However, we
|
70
|
+
# never follow U.S. However, we are being conservative
|
71
71
|
# and not splitting by default, so we need to look for places
|
72
72
|
# where we definitely can split. Obviously SENTENCE_STARTERS
|
73
73
|
# will never cover all cases, but as the gem is named
|
@@ -76,17 +76,17 @@ module PragmaticSegmenter
|
|
76
76
|
# sentence but could never follow one of the abbreviations below.
|
77
77
|
|
78
78
|
SENTENCE_STARTERS.each do |word|
|
79
|
-
txt
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
79
|
+
txt.gsub!(/U∯S∯\s#{Regexp.escape(word)}\s/, "U∯S\.\s#{Regexp.escape(word)}\s")
|
80
|
+
txt.gsub!(/U\.S∯\s#{Regexp.escape(word)}\s/, "U\.S\.\s#{Regexp.escape(word)}\s")
|
81
|
+
txt.gsub!(/U∯K∯\s#{Regexp.escape(word)}\s/, "U∯K\.\s#{Regexp.escape(word)}\s")
|
82
|
+
txt.gsub!(/U\.K∯\s#{Regexp.escape(word)}\s/, "U\.K\.\s#{Regexp.escape(word)}\s")
|
83
|
+
txt.gsub!(/E∯U∯\s#{Regexp.escape(word)}\s/, "E∯U\.\s#{Regexp.escape(word)}\s")
|
84
|
+
txt.gsub!(/E\.U∯\s#{Regexp.escape(word)}\s/, "E\.U\.\s#{Regexp.escape(word)}\s")
|
85
|
+
txt.gsub!(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
|
86
|
+
txt.gsub!(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
|
87
|
+
txt.gsub!(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
|
88
|
+
txt.gsub!(/i.v∯\s#{Regexp.escape(word)}\s/, "i\.v\.\s#{Regexp.escape(word)}\s")
|
89
|
+
txt.gsub!(/I.V∯\s#{Regexp.escape(word)}\s/, "I\.V\.\s#{Regexp.escape(word)}\s")
|
90
90
|
end
|
91
91
|
txt
|
92
92
|
end
|
@@ -95,29 +95,32 @@ module PragmaticSegmenter
|
|
95
95
|
mpa = txt.scan(@language::MULTI_PERIOD_ABBREVIATION_REGEX)
|
96
96
|
return txt if mpa.empty?
|
97
97
|
mpa.each do |r|
|
98
|
-
txt
|
98
|
+
txt.gsub!(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
|
99
99
|
end
|
100
100
|
txt
|
101
101
|
end
|
102
102
|
|
103
103
|
def replace_pre_number_abbr(txt, abbr)
|
104
|
-
txt.gsub(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
|
105
|
-
|
106
|
-
|
104
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
|
105
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
|
106
|
+
txt
|
107
107
|
end
|
108
108
|
|
109
109
|
def replace_prepositive_abbr(txt, abbr)
|
110
|
-
txt.gsub(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
|
111
|
-
|
110
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
|
111
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
|
112
|
+
txt
|
112
113
|
end
|
113
114
|
|
114
115
|
def replace_period_of_abbr(txt, abbr)
|
115
|
-
txt.gsub(/(?<=\s#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
|
116
|
-
|
116
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
|
117
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
|
118
|
+
txt
|
117
119
|
end
|
118
120
|
|
119
121
|
def replace_possessive_abbreviations(txt)
|
120
|
-
txt.gsub(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
|
122
|
+
txt.gsub!(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
|
123
|
+
txt
|
121
124
|
end
|
122
125
|
end
|
123
126
|
end
|
data/lib/pragmatic_segmenter/languages/russian.rb
CHANGED
@@ -13,9 +13,10 @@ module PragmaticSegmenter
|
|
13
13
|
private
|
14
14
|
|
15
15
|
def replace_period_of_abbr(txt, abbr)
|
16
|
-
txt.gsub(/(?<=\s#{abbr.strip})\./, '∯')
|
17
|
-
|
18
|
-
|
16
|
+
txt.gsub!(/(?<=\s#{abbr.strip})\./, '∯')
|
17
|
+
txt.gsub!(/(?<=\A#{abbr.strip})\./, '∯')
|
18
|
+
txt.gsub!(/(?<=^#{abbr.strip})\./, '∯')
|
19
|
+
txt
|
19
20
|
end
|
20
21
|
end
|
21
22
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.4
|
4
|
+
version: 0.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -160,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
160
160
|
version: '0'
|
161
161
|
requirements: []
|
162
162
|
rubyforge_project:
|
163
|
-
rubygems_version: 2.4.
|
163
|
+
rubygems_version: 2.4.1
|
164
164
|
signing_key:
|
165
165
|
specification_version: 4
|
166
166
|
summary: A rule-based sentence boundary detection gem that works out-of-the-box across
|