pragmatic_segmenter 0.3.4 → 0.3.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a9cb6133aca84f8c6ff233ec6fb34b276cf47964
4
- data.tar.gz: 00c1f664707e86e5c2ae5740c53acde5c814ece8
3
+ metadata.gz: 805bb57215b38dc30f107be60c18c141d8dc1297
4
+ data.tar.gz: 389bf5f6700f44cb255279a4391c849d0ac67f69
5
5
  SHA512:
6
- metadata.gz: d5726605fa78ec4067c79ed592a7983f2638b26a81fb88cf23bdffeb26d842c0eaed39a531181ecef6456218208f6d297e4316b36fc7f4a15f4deb2ebb7cb800
7
- data.tar.gz: a1c99c7f3c73c1624a2b1d4792c8937dd27d3dfd4667dcb05844376d5ecad13ebe065f18fe3f40a61a4aa23baa3e8ab9b4dc3bc547821c6d9fc5700cd5a16f20
6
+ metadata.gz: 749c13c831913bcd606f0d08194d09ca982c7e033c59c8520bf1b20a4146934725ab8252bc54362815cd34f5ea45f8c5ed1cde2da80b8beb506093bd86138522
7
+ data.tar.gz: d5f05492994604c023fc357b7b030878eb2d8554f2d898efe196530a5777d394359acd4e8c600451cec511df4e2dfafa30cc34d2589a8d983ae3b11299aa9869
@@ -67,7 +67,7 @@ module PragmaticSegmenter
67
67
  # Some might say that the set of words that follow an
68
68
  # abbreviation such as U.S. (i.e. U.S. Government) is smaller than
69
69
  # the set of words that could start a sentence and
70
- # never follow U.S. However, we are being conservative
70
+ # never follow U.S. However, we are being conservative
71
71
  # and not splitting by default, so we need to look for places
72
72
  # where we definitely can split. Obviously SENTENCE_STARTERS
73
73
  # will never cover all cases, but as the gem is named
@@ -76,17 +76,17 @@ module PragmaticSegmenter
76
76
  # sentence but could never follow one of the abbreviations below.
77
77
 
78
78
  SENTENCE_STARTERS.each do |word|
79
- txt = txt.gsub(/U∯S∯\s#{Regexp.escape(word)}\s/, "U∯S\.\s#{Regexp.escape(word)}\s")
80
- .gsub(/U\.S∯\s#{Regexp.escape(word)}\s/, "U\.S\.\s#{Regexp.escape(word)}\s")
81
- .gsub(/U∯K∯\s#{Regexp.escape(word)}\s/, "U∯K\.\s#{Regexp.escape(word)}\s")
82
- .gsub(/U\.K∯\s#{Regexp.escape(word)}\s/, "U\.K\.\s#{Regexp.escape(word)}\s")
83
- .gsub(/E∯U∯\s#{Regexp.escape(word)}\s/, "E∯U\.\s#{Regexp.escape(word)}\s")
84
- .gsub(/E\.U∯\s#{Regexp.escape(word)}\s/, "E\.U\.\s#{Regexp.escape(word)}\s")
85
- .gsub(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
86
- .gsub(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
87
- .gsub(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
88
- .gsub(/i.v∯\s#{Regexp.escape(word)}\s/, "i\.v\.\s#{Regexp.escape(word)}\s")
89
- .gsub(/I.V∯\s#{Regexp.escape(word)}\s/, "I\.V\.\s#{Regexp.escape(word)}\s")
79
+ txt.gsub!(/U∯S∯\s#{Regexp.escape(word)}\s/, "U∯S\.\s#{Regexp.escape(word)}\s")
80
+ txt.gsub!(/U\.S∯\s#{Regexp.escape(word)}\s/, "U\.S\.\s#{Regexp.escape(word)}\s")
81
+ txt.gsub!(/U∯K∯\s#{Regexp.escape(word)}\s/, "U∯K\.\s#{Regexp.escape(word)}\s")
82
+ txt.gsub!(/U\.K∯\s#{Regexp.escape(word)}\s/, "U\.K\.\s#{Regexp.escape(word)}\s")
83
+ txt.gsub!(/E∯U∯\s#{Regexp.escape(word)}\s/, "E∯U\.\s#{Regexp.escape(word)}\s")
84
+ txt.gsub!(/E\.U∯\s#{Regexp.escape(word)}\s/, "E\.U\.\s#{Regexp.escape(word)}\s")
85
+ txt.gsub!(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
86
+ txt.gsub!(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
87
+ txt.gsub!(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
88
+ txt.gsub!(/i.v∯\s#{Regexp.escape(word)}\s/, "i\.v\.\s#{Regexp.escape(word)}\s")
89
+ txt.gsub!(/I.V∯\s#{Regexp.escape(word)}\s/, "I\.V\.\s#{Regexp.escape(word)}\s")
90
90
  end
91
91
  txt
92
92
  end
@@ -95,29 +95,32 @@ module PragmaticSegmenter
95
95
  mpa = txt.scan(@language::MULTI_PERIOD_ABBREVIATION_REGEX)
96
96
  return txt if mpa.empty?
97
97
  mpa.each do |r|
98
- txt = txt.gsub(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
98
+ txt.gsub!(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
99
99
  end
100
100
  txt
101
101
  end
102
102
 
103
103
  def replace_pre_number_abbr(txt, abbr)
104
- txt.gsub(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
105
- .gsub(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
106
-
104
+ txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
105
+ txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
106
+ txt
107
107
  end
108
108
 
109
109
  def replace_prepositive_abbr(txt, abbr)
110
- txt.gsub(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
111
- .gsub(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
110
+ txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
111
+ txt.gsub!(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
112
+ txt
112
113
  end
113
114
 
114
115
  def replace_period_of_abbr(txt, abbr)
115
- txt.gsub(/(?<=\s#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
116
- .gsub(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
116
+ txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
117
+ txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
118
+ txt
117
119
  end
118
120
 
119
121
  def replace_possessive_abbreviations(txt)
120
- txt.gsub(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
122
+ txt.gsub!(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
123
+ txt
121
124
  end
122
125
  end
123
126
  end
@@ -22,7 +22,8 @@ module PragmaticSegmenter
22
22
  private
23
23
 
24
24
  def scan_for_replacements(txt, am, index, character_array)
25
- txt.gsub(/(?<=#{am})\./, '∯')
25
+ txt.gsub!(/(?<=#{am})\./, '∯')
26
+ txt
26
27
  end
27
28
  end
28
29
  end
@@ -75,7 +75,8 @@ module PragmaticSegmenter
75
75
  private
76
76
 
77
77
  def scan_for_replacements(txt, am, index, character_array)
78
- txt.gsub(/(?<=#{am})\.(?=\s)/, '∯')
78
+ txt.gsub!(/(?<=#{am})\.(?=\s)/, '∯')
79
+ txt
79
80
  end
80
81
  end
81
82
 
@@ -13,7 +13,8 @@ module PragmaticSegmenter
13
13
  private
14
14
 
15
15
  def scan_for_replacements(txt, am, index, character_array)
16
- txt.gsub(/(?<=#{am})\./, '∯')
16
+ txt.gsub!(/(?<=#{am})\./, '∯')
17
+ txt
17
18
  end
18
19
  end
19
20
  end
@@ -13,9 +13,10 @@ module PragmaticSegmenter
13
13
  private
14
14
 
15
15
  def replace_period_of_abbr(txt, abbr)
16
- txt.gsub(/(?<=\s#{abbr.strip})\./, '∯')
17
- .gsub(/(?<=\A#{abbr.strip})\./, '∯')
18
- .gsub(/(?<=^#{abbr.strip})\./, '∯')
16
+ txt.gsub!(/(?<=\s#{abbr.strip})\./, '∯')
17
+ txt.gsub!(/(?<=\A#{abbr.strip})\./, '∯')
18
+ txt.gsub!(/(?<=^#{abbr.strip})\./, '∯')
19
+ txt
19
20
  end
20
21
  end
21
22
  end
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.3.4"
2
+ VERSION = "0.3.5"
3
3
  end
@@ -58,5 +58,4 @@ RSpec.describe PragmaticSegmenter::Segmenter do
58
58
  end
59
59
  end
60
60
  end
61
-
62
61
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.4
4
+ version: 0.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-12-22 00:00:00.000000000 Z
11
+ date: 2016-01-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -160,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
160
160
  version: '0'
161
161
  requirements: []
162
162
  rubyforge_project:
163
- rubygems_version: 2.4.8
163
+ rubygems_version: 2.4.1
164
164
  signing_key:
165
165
  specification_version: 4
166
166
  summary: A rule-based sentence boundary detection gem that works out-of-the-box across