pragmatic_segmenter 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e0545b8e2fe6446107740b5c458b96e76b6edc51
4
- data.tar.gz: 746d97aba038d8f23a6701d7df08205ff48203a8
3
+ metadata.gz: ae3798fa47a86a8928835153af20c91d181ab2d5
4
+ data.tar.gz: 265670562de5e8b25aa90454919f044c910353f8
5
5
  SHA512:
6
- metadata.gz: 5975faedda7f913678ea122317266722895376da90be3b4094e50d61d2eef1e531b3df890d93199393f1945a053e14646e8d2b7bc73287de9250751f332483aa
7
- data.tar.gz: d48fcd09e289833f82a5e6aa5915b3faa5ecf4417874273a2b8bb7640f51454f1374736449478da8328006277b0727c16b3b8badd258791fb011dd23f351266e
6
+ metadata.gz: 33eea4d021662c497763950fb5815e29b59214d2c4d7056f77f081ea3edc77a0fc9c54a9467d89dc5d278174e316ced772a6f826bb477c711239eb2b0d0b1722
7
+ data.tar.gz: 73fb101b5a2c6a3d2f57bdede1d37a70286519dd54ca93aaa2cfc467dd5202961bd3f27b3f25c906ea67e3038542823303bbb411bb4ffd6fafd9f7145f3aa116
data/README.md CHANGED
@@ -407,6 +407,12 @@ One further habit which was somewhat weakened . . . was that of combining words
407
407
  => ["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . ."]
408
408
  ```
409
409
 
410
+ 52.) **No whitespace in between sentences** *Credit: Don_Patrick*
411
+ ```
412
+ Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot.
413
+ => ["Hello world.", "Today is Tuesday.", "Mr. Smith went to the store and bought 1,000.", "That is a lot."]
414
+ ```
415
+
410
416
  ####Golden Rules (German)
411
417
 
412
418
  1.) **Quotation at end of sentence**
@@ -80,7 +80,7 @@ module PragmaticSegmenter
80
80
  end
81
81
 
82
82
  def abbreviations
83
- PragmaticSegmenter::Abbreviation.new
83
+ @abbr ||= PragmaticSegmenter::Abbreviation.new
84
84
  end
85
85
 
86
86
  def replace_abbreviation_as_sentence_boundary(txt)
@@ -17,6 +17,14 @@ module PragmaticSegmenter
17
17
  # xhtml, inline formatting, etc.
18
18
  class Cleaner
19
19
  include Rules
20
+ URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']
21
+
22
+ # Rubular: http://rubular.com/r/6dt98uI76u
23
+ NO_SPACE_BETWEEN_SENTENCES_REGEX = /(?<=[a-z])\.(?=[A-Z])/
24
+
25
+ # Rubular: http://rubular.com/r/l6KN6rH5XE
26
+ NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = /(?<=\d)\.(?=[A-Z])/
27
+
20
28
  # Rubular: http://rubular.com/r/V57WnM9Zut
21
29
  NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')
22
30
 
@@ -53,9 +61,17 @@ module PragmaticSegmenter
53
61
  # Rubular: http://rubular.com/r/IQ4TPfsbd8
54
62
  ConsecutiveForwardSlashRule = Rule.new(/\/{3}/, '')
55
63
 
64
+ # Rubular: http://rubular.com/r/6dt98uI76u
65
+ NoSpaceBetweenSentencesRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
66
+
67
+ # Rubular: http://rubular.com/r/l6KN6rH5XE
68
+ NoSpaceBetweenSentencesDigitRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')
69
+
56
70
  EscapedCarriageReturnRule = Rule.new(/\\r/, "\r")
71
+ TypoEscapedCarriageReturnRule = Rule.new(/\\\ r/, "\r")
57
72
 
58
73
  EscapedNewLineRule = Rule.new(/\\n/, "\n")
74
+ TypoEscapedNewLineRule = Rule.new(/\\\ n/, "\n")
59
75
 
60
76
  ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")
61
77
 
@@ -92,11 +108,36 @@ module PragmaticSegmenter
92
108
  @clean_text.apply(InlineFormattingRule)
93
109
  clean_quotations(@clean_text)
94
110
  clean_table_of_contents(@clean_text)
111
+ check_for_no_space_in_between_sentences(@clean_text)
95
112
  clean_consecutive_characters(@clean_text)
96
113
  end
97
114
 
98
115
  private
99
116
 
117
+ def check_for_no_space_in_between_sentences(txt)
118
+ words = txt.split(' ')
119
+ words.each do |word|
120
+ search_for_connected_sentences(word, txt, NO_SPACE_BETWEEN_SENTENCES_REGEX, NoSpaceBetweenSentencesRule)
121
+ search_for_connected_sentences(word, txt, NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, NoSpaceBetweenSentencesDigitRule)
122
+ end
123
+ txt
124
+ end
125
+
126
+ def search_for_connected_sentences(word, txt, regex, rule)
127
+ if word =~ regex
128
+ unless URL_EMAIL_KEYWORDS.any? { |web| word =~ /#{web}/ }
129
+ unless abbreviations.any? { |abbr| word =~ /#{abbr}/i }
130
+ new_word = word.dup.apply(rule)
131
+ txt.gsub!(/#{Regexp.escape(word)}/, new_word)
132
+ end
133
+ end
134
+ end
135
+ end
136
+
137
+ def abbreviations
138
+ @abbr ||= PragmaticSegmenter::Abbreviation.new.all
139
+ end
140
+
100
141
  def remove_all_newlines(txt)
101
142
  clean_text = remove_newline_in_middle_of_sentence(txt)
102
143
  remove_newline_in_middle_of_word(clean_text)
@@ -118,7 +159,9 @@ module PragmaticSegmenter
118
159
 
119
160
  def replace_escaped_newlines(txt)
120
161
  txt.apply(EscapedNewLineRule).
121
- apply(EscapedCarriageReturnRule)
162
+ apply(EscapedCarriageReturnRule).
163
+ apply(TypoEscapedNewLineRule).
164
+ apply(TypoEscapedCarriageReturnRule)
122
165
  end
123
166
 
124
167
  def replace_double_newlines(txt)
@@ -18,6 +18,11 @@ module PragmaticSegmenter
18
18
  end
19
19
 
20
20
  class Cleaner < PragmaticSegmenter::Cleaner
21
+ private
22
+
23
+ def abbreviations
24
+ PragmaticSegmenter::Languages::Deutsch::Abbreviation.new.all
25
+ end
21
26
  end
22
27
 
23
28
  class Number < PragmaticSegmenter::Number
@@ -58,7 +63,7 @@ module PragmaticSegmenter
58
63
  end
59
64
 
60
65
  class Abbreviation < PragmaticSegmenter::Abbreviation
61
- ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b ', 'z.t ', 'z.z', 'z.zt', 'zt', 'zzt']
66
+ ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt']
62
67
  NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp']
63
68
 
64
69
  def all
@@ -15,13 +15,17 @@ module PragmaticSegmenter
15
15
  def clean_quotations(txt)
16
16
  txt.gsub(/`/, "'")
17
17
  end
18
+
19
+ def abbreviations
20
+ []
21
+ end
18
22
  end
19
23
 
20
24
  class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
21
25
  private
22
26
 
23
27
  def abbreviations
24
- PragmaticSegmenter::Languages::English::Abbreviation.new
28
+ PragmaticSegmenter::Abbreviation.new
25
29
  end
26
30
  end
27
31
  end
@@ -5,6 +5,11 @@ module PragmaticSegmenter
5
5
  end
6
6
 
7
7
  class Cleaner < PragmaticSegmenter::Cleaner
8
+ private
9
+
10
+ def abbreviations
11
+ PragmaticSegmenter::Languages::French::Abbreviation.new.all
12
+ end
8
13
  end
9
14
 
10
15
  class Abbreviation < PragmaticSegmenter::Abbreviation
@@ -10,6 +10,11 @@ module PragmaticSegmenter
10
10
  end
11
11
 
12
12
  class Cleaner < PragmaticSegmenter::Cleaner
13
+ private
14
+
15
+ def abbreviations
16
+ PragmaticSegmenter::Languages::Italian::Abbreviation.new.all
17
+ end
13
18
  end
14
19
 
15
20
  class Abbreviation < PragmaticSegmenter::Abbreviation
@@ -10,6 +10,11 @@ module PragmaticSegmenter
10
10
  end
11
11
 
12
12
  class Cleaner < PragmaticSegmenter::Cleaner
13
+ private
14
+
15
+ def abbreviations
16
+ PragmaticSegmenter::Languages::Russian::Abbreviation.new.all
17
+ end
13
18
  end
14
19
 
15
20
  class Abbreviation < PragmaticSegmenter::Abbreviation
@@ -10,6 +10,11 @@ module PragmaticSegmenter
10
10
  end
11
11
 
12
12
  class Cleaner < PragmaticSegmenter::Cleaner
13
+ private
14
+
15
+ def abbreviations
16
+ PragmaticSegmenter::Languages::Spanish::Abbreviation.new.all
17
+ end
13
18
  end
14
19
 
15
20
  class Abbreviation < PragmaticSegmenter::Abbreviation
@@ -29,7 +29,8 @@ module PragmaticSegmenter
29
29
  reformatted_text = PragmaticSegmenter::List.new(text: text).add_line_break
30
30
  reformatted_text = replace_abbreviations(reformatted_text)
31
31
  reformatted_text = replace_numbers(reformatted_text)
32
- reformatted_text = reformatted_text.apply(GeoLocationRule)
32
+ reformatted_text.apply(AbbreviationsWithMultiplePeriodsAndEmailRule)
33
+ reformatted_text.apply(GeoLocationRule)
33
34
  split_into_segments(reformatted_text)
34
35
  end
35
36
 
@@ -37,7 +38,7 @@ module PragmaticSegmenter
37
38
 
38
39
  def split_into_segments(txt)
39
40
  txt.split("\r")
40
- .map! { |segment| segment.apply(SingleNewLineRule, EllipsisRules::All, EmailRule) }
41
+ .map! { |segment| segment.apply(SingleNewLineRule, EllipsisRules::All) }
41
42
  .map { |segment| check_for_punctuation(segment) }.flatten
42
43
  .map! { |segment| segment.apply(SubSymbolsRules::All) }
43
44
  .map { |segment| post_process_segments(segment) }
@@ -1,7 +1,7 @@
1
1
  module PragmaticSegmenter
2
2
  module Rules
3
3
  # Rubular: http://rubular.com/r/EUbZCNfgei
4
- EmailRule = Rule.new(/(\w)(\.)(\w)/, '\1∮\3')
4
+ AbbreviationsWithMultiplePeriodsAndEmailRule = Rule.new(/(\w)(\.)(\w)/, '\1∮\3')
5
5
 
6
6
  # Rubular: http://rubular.com/r/G2opjedIm9
7
7
  GeoLocationRule = Rule.new(/(?<=[a-zA-z]°)\.(?=\s*\d+)/, '∯')
@@ -32,7 +32,6 @@ module PragmaticSegmenter
32
32
  @language = args[:language] || 'en'
33
33
  @doc_type = args[:doc_type]
34
34
  @text = text.dup
35
-
36
35
  unless args[:clean].eql?(false)
37
36
  @text = cleaner_class.new(text: @text, doc_type: args[:doc_type]).clean
38
37
  end
@@ -40,7 +39,6 @@ module PragmaticSegmenter
40
39
 
41
40
  def segment
42
41
  return [] unless text
43
-
44
42
  process_class.new(text: text, doc_type: doc_type).process
45
43
  end
46
44
  end
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.6"
3
3
  end
@@ -258,6 +258,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
258
258
  ps = PragmaticSegmenter::Segmenter.new(text: "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .", language: "en")
259
259
  expect(ps.segment).to eq(["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . ."])
260
260
  end
261
+
262
+ it "No whitespace in between sentences #052" do
263
+ ps = PragmaticSegmenter::Segmenter.new(text: "Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot.", language: "en")
264
+ expect(ps.segment).to eq(["Hello world.", "Today is Tuesday.", "Mr. Smith went to the store and bought 1,000.", "That is a lot."])
265
+ end
261
266
  end
262
267
 
263
268
  context "Golden Rules (languages other than English)" do
@@ -873,6 +878,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
873
878
  ps = PragmaticSegmenter::Segmenter.new(text: 'Hello World. \r\n Hello.', language: 'en')
874
879
  expect(ps.segment).to eq(["Hello World.", "Hello."])
875
880
  end
881
+
882
+ it 'correctly segments text #082' do
883
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Hello World. \ r \ nHello.', language: 'en')
884
+ expect(ps.segment).to eq(["Hello World.", "Hello."])
885
+ end
876
886
  end
877
887
  end
878
888
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-10 00:00:00.000000000 Z
11
+ date: 2015-01-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler