pragmatic_segmenter 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a9cb6133aca84f8c6ff233ec6fb34b276cf47964
4
- data.tar.gz: 00c1f664707e86e5c2ae5740c53acde5c814ece8
3
+ metadata.gz: 805bb57215b38dc30f107be60c18c141d8dc1297
4
+ data.tar.gz: 389bf5f6700f44cb255279a4391c849d0ac67f69
5
5
  SHA512:
6
- metadata.gz: d5726605fa78ec4067c79ed592a7983f2638b26a81fb88cf23bdffeb26d842c0eaed39a531181ecef6456218208f6d297e4316b36fc7f4a15f4deb2ebb7cb800
7
- data.tar.gz: a1c99c7f3c73c1624a2b1d4792c8937dd27d3dfd4667dcb05844376d5ecad13ebe065f18fe3f40a61a4aa23baa3e8ab9b4dc3bc547821c6d9fc5700cd5a16f20
6
+ metadata.gz: 749c13c831913bcd606f0d08194d09ca982c7e033c59c8520bf1b20a4146934725ab8252bc54362815cd34f5ea45f8c5ed1cde2da80b8beb506093bd86138522
7
+ data.tar.gz: d5f05492994604c023fc357b7b030878eb2d8554f2d898efe196530a5777d394359acd4e8c600451cec511df4e2dfafa30cc34d2589a8d983ae3b11299aa9869
@@ -67,7 +67,7 @@ module PragmaticSegmenter
67
67
  # Some might say that the set of words that follow an
68
68
  # abbreviation such as U.S. (i.e. U.S. Government) is smaller than
69
69
  # the set of words that could start a sentence and
70
- # never follow U.S. However, we are being conservative
70
+ # never follow U.S. However, we are being conservative
71
71
  # and not splitting by default, so we need to look for places
72
72
  # where we definitely can split. Obviously SENTENCE_STARTERS
73
73
  # will never cover all cases, but as the gem is named
@@ -76,17 +76,17 @@ module PragmaticSegmenter
76
76
  # sentence but could never follow one of the abbreviations below.
77
77
 
78
78
  SENTENCE_STARTERS.each do |word|
79
- txt = txt.gsub(/U∯S∯\s#{Regexp.escape(word)}\s/, "U∯S\.\s#{Regexp.escape(word)}\s")
80
- .gsub(/U\.S∯\s#{Regexp.escape(word)}\s/, "U\.S\.\s#{Regexp.escape(word)}\s")
81
- .gsub(/U∯K∯\s#{Regexp.escape(word)}\s/, "U∯K\.\s#{Regexp.escape(word)}\s")
82
- .gsub(/U\.K∯\s#{Regexp.escape(word)}\s/, "U\.K\.\s#{Regexp.escape(word)}\s")
83
- .gsub(/E∯U∯\s#{Regexp.escape(word)}\s/, "E∯U\.\s#{Regexp.escape(word)}\s")
84
- .gsub(/E\.U∯\s#{Regexp.escape(word)}\s/, "E\.U\.\s#{Regexp.escape(word)}\s")
85
- .gsub(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
86
- .gsub(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
87
- .gsub(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
88
- .gsub(/i.v∯\s#{Regexp.escape(word)}\s/, "i\.v\.\s#{Regexp.escape(word)}\s")
89
- .gsub(/I.V∯\s#{Regexp.escape(word)}\s/, "I\.V\.\s#{Regexp.escape(word)}\s")
79
+ txt.gsub!(/U∯S∯\s#{Regexp.escape(word)}\s/, "U∯S\.\s#{Regexp.escape(word)}\s")
80
+ txt.gsub!(/U\.S∯\s#{Regexp.escape(word)}\s/, "U\.S\.\s#{Regexp.escape(word)}\s")
81
+ txt.gsub!(/U∯K∯\s#{Regexp.escape(word)}\s/, "U∯K\.\s#{Regexp.escape(word)}\s")
82
+ txt.gsub!(/U\.K∯\s#{Regexp.escape(word)}\s/, "U\.K\.\s#{Regexp.escape(word)}\s")
83
+ txt.gsub!(/E∯U∯\s#{Regexp.escape(word)}\s/, "E∯U\.\s#{Regexp.escape(word)}\s")
84
+ txt.gsub!(/E\.U∯\s#{Regexp.escape(word)}\s/, "E\.U\.\s#{Regexp.escape(word)}\s")
85
+ txt.gsub!(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
86
+ txt.gsub!(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
87
+ txt.gsub!(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
88
+ txt.gsub!(/i.v∯\s#{Regexp.escape(word)}\s/, "i\.v\.\s#{Regexp.escape(word)}\s")
89
+ txt.gsub!(/I.V∯\s#{Regexp.escape(word)}\s/, "I\.V\.\s#{Regexp.escape(word)}\s")
90
90
  end
91
91
  txt
92
92
  end
@@ -95,29 +95,32 @@ module PragmaticSegmenter
95
95
  mpa = txt.scan(@language::MULTI_PERIOD_ABBREVIATION_REGEX)
96
96
  return txt if mpa.empty?
97
97
  mpa.each do |r|
98
- txt = txt.gsub(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
98
+ txt.gsub!(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
99
99
  end
100
100
  txt
101
101
  end
102
102
 
103
103
  def replace_pre_number_abbr(txt, abbr)
104
- txt.gsub(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
105
- .gsub(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
106
-
104
+ txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
105
+ txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
106
+ txt
107
107
  end
108
108
 
109
109
  def replace_prepositive_abbr(txt, abbr)
110
- txt.gsub(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
111
- .gsub(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
110
+ txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
111
+ txt.gsub!(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
112
+ txt
112
113
  end
113
114
 
114
115
  def replace_period_of_abbr(txt, abbr)
115
- txt.gsub(/(?<=\s#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
116
- .gsub(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
116
+ txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
117
+ txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
118
+ txt
117
119
  end
118
120
 
119
121
  def replace_possessive_abbreviations(txt)
120
- txt.gsub(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
122
+ txt.gsub!(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
123
+ txt
121
124
  end
122
125
  end
123
126
  end
@@ -22,7 +22,8 @@ module PragmaticSegmenter
22
22
  private
23
23
 
24
24
  def scan_for_replacements(txt, am, index, character_array)
25
- txt.gsub(/(?<=#{am})\./, '∯')
25
+ txt.gsub!(/(?<=#{am})\./, '∯')
26
+ txt
26
27
  end
27
28
  end
28
29
  end
@@ -75,7 +75,8 @@ module PragmaticSegmenter
75
75
  private
76
76
 
77
77
  def scan_for_replacements(txt, am, index, character_array)
78
- txt.gsub(/(?<=#{am})\.(?=\s)/, '∯')
78
+ txt.gsub!(/(?<=#{am})\.(?=\s)/, '∯')
79
+ txt
79
80
  end
80
81
  end
81
82
 
@@ -13,7 +13,8 @@ module PragmaticSegmenter
13
13
  private
14
14
 
15
15
  def scan_for_replacements(txt, am, index, character_array)
16
- txt.gsub(/(?<=#{am})\./, '∯')
16
+ txt.gsub!(/(?<=#{am})\./, '∯')
17
+ txt
17
18
  end
18
19
  end
19
20
  end
@@ -13,9 +13,10 @@ module PragmaticSegmenter
13
13
  private
14
14
 
15
15
  def replace_period_of_abbr(txt, abbr)
16
- txt.gsub(/(?<=\s#{abbr.strip})\./, '∯')
17
- .gsub(/(?<=\A#{abbr.strip})\./, '∯')
18
- .gsub(/(?<=^#{abbr.strip})\./, '∯')
16
+ txt.gsub!(/(?<=\s#{abbr.strip})\./, '∯')
17
+ txt.gsub!(/(?<=\A#{abbr.strip})\./, '∯')
18
+ txt.gsub!(/(?<=^#{abbr.strip})\./, '∯')
19
+ txt
19
20
  end
20
21
  end
21
22
  end
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.3.4"
2
+ VERSION = "0.3.5"
3
3
  end
@@ -58,5 +58,4 @@ RSpec.describe PragmaticSegmenter::Segmenter do
58
58
  end
59
59
  end
60
60
  end
61
-
62
61
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.4
4
+ version: 0.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-12-22 00:00:00.000000000 Z
11
+ date: 2016-01-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -160,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
160
160
  version: '0'
161
161
  requirements: []
162
162
  rubyforge_project:
163
- rubygems_version: 2.4.8
163
+ rubygems_version: 2.4.1
164
164
  signing_key:
165
165
  specification_version: 4
166
166
  summary: A rule-based sentence boundary detection gem that works out-of-the-box across