pragmatic_segmenter 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1cf05fd20f9672f1186e1c4f857f6045d2fcf4b8
4
- data.tar.gz: f8c2685f66e2a177f18683bae290dfc892c8b67f
3
+ metadata.gz: ebda4e8fba93e6cc9fbaeb57fe71e0d3ae47d721
4
+ data.tar.gz: 3b2e121d592797dea12efc587c198082a4a23955
5
5
  SHA512:
6
- metadata.gz: b45bc699fa3cb055c54a028f87c803889ac3889ba454534abf794c9fc5cce36983172898955723c0c4b05ec040320e0126fa36fdf565897ca358aa0d34b1db8f
7
- data.tar.gz: 65384a1dd4e0b43a03925447e7dbf4ce7c8fe1f4c58ee86c3b660f1a33e3783a1cc9c73235ff600b4c32c140a15a54955eb55a117e83ddf4b0d4064a2616e533
6
+ metadata.gz: 169f112a82005dcd9b399b8337ef0e25afffde10f6a30f4c038f7621550dd3696c4ad054725763734d420de6445577946c8fc6c249e1d0ff13f5788adde0b9b9
7
+ data.tar.gz: 9b4b51797b69972e30adc23f948de2a26bc9a9e160ce1271c8f869055e466be83e451bc36eed44020d1e6ee903922d3594ef254e01df337ee884ca80a2a72965
data/README.md CHANGED
@@ -754,6 +754,9 @@ To test the relative performance of different segmentation tools and libraries I
754
754
  * Fix missing abbreviations
755
755
  * Add footnote rule to `cleaner.rb`
756
756
 
757
+ **Version 0.1.3**
758
+ * Improve punctuation in bracket replacement
759
+
757
760
  ## Contributing
758
761
 
759
762
  If you find a text that is incorrectly segmented using this gem, please submit an issue.
@@ -136,16 +136,19 @@ module PragmaticSegmenter
136
136
  end
137
137
 
138
138
  def replace_pre_number_abbr(txt, abbr)
139
- txt.gsub(/(?<=#{abbr.strip})\.(?=\s\d)/, '∯').gsub(/(?<=#{abbr.strip})\.(?=\s+\()/, '∯')
139
+ txt.gsub(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
140
+ .gsub(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
141
+
140
142
  end
141
143
 
142
144
  def replace_prepositive_abbr(txt, abbr)
143
- txt.gsub(/(?<=#{abbr.strip})\.(?=\s)/, '∯')
145
+ txt.gsub(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
146
+ .gsub(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
144
147
  end
145
148
 
146
149
  def replace_period_of_abbr(txt, abbr)
147
- txt.gsub(/(?<=#{abbr.strip})\.(?=((\.|:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
148
- .gsub(/(?<=#{abbr.strip})\.(?=,)/, '∯')
150
+ txt.gsub(/(?<=\s#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
151
+ .gsub(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
149
152
  end
150
153
 
151
154
  def replace_possessive_abbreviations(txt)
@@ -61,9 +61,6 @@ module PragmaticSegmenter
61
61
  # Rubular: http://rubular.com/r/IQ4TPfsbd8
62
62
  ConsecutiveForwardSlashRule = Rule.new(/\/{3}/, '')
63
63
 
64
- # Rubular: http://rubular.com/r/gEjxQ0HmSD
65
- FootnoteRule = Rule.new(/\[\?\]/, '[&ᓷ&]')
66
-
67
64
  # Rubular: http://rubular.com/r/6dt98uI76u
68
65
  NoSpaceBetweenSentencesRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
69
66
 
@@ -108,7 +105,8 @@ module PragmaticSegmenter
108
105
  replace_newlines(@clean_text)
109
106
  replace_escaped_newlines(@clean_text)
110
107
  @clean_text.apply(HtmlRules::All)
111
- @clean_text.apply(InlineFormattingRule, FootnoteRule)
108
+ replace_punctuation_in_brackets(@clean_text)
109
+ @clean_text.apply(InlineFormattingRule)
112
110
  clean_quotations(@clean_text)
113
111
  clean_table_of_contents(@clean_text)
114
112
  check_for_no_space_in_between_sentences(@clean_text)
@@ -126,6 +124,12 @@ module PragmaticSegmenter
126
124
  txt
127
125
  end
128
126
 
127
+ def replace_punctuation_in_brackets(txt)
128
+ txt.dup.gsub!(/\[(?:[^\]])*\]/) do |match|
129
+ txt.gsub!(/#{Regexp.escape(match)}/, "#{match.dup.gsub!(/\?/, '&ᓷ&')}") if match.include?('?')
130
+ end
131
+ end
132
+
129
133
  def search_for_connected_sentences(word, txt, regex, rule)
130
134
  if word =~ regex
131
135
  unless URL_EMAIL_KEYWORDS.any? { |web| word =~ /#{web}/ }
@@ -19,7 +19,7 @@ module PragmaticSegmenter
19
19
 
20
20
  class Abbreviation < PragmaticSegmenter::Abbreviation
21
21
  ABBREVIATIONS = ['a.c', 'a/c', 'abr', 'adj', 'admón', 'afmo', 'ago', 'almte', 'ap', 'apdo', 'arq', 'art', 'atte', 'av', 'avda', 'bco', 'bibl', 'bs. as', 'c', 'c.f', 'c.g', 'c/c', 'c/u', 'cap', 'cc.aa', 'cdad', 'cm', 'co', 'cra', 'cta', 'cv', 'd.e.p', 'da', 'dcha', 'dcho', 'dep', 'dic', 'dicc', 'dir', 'dn', 'doc', 'dom', 'dpto', 'dr', 'dra', 'dto', 'ee', 'ej', 'en', 'entlo', 'esq', 'etc', 'excmo', 'ext', 'f.c', 'fca', 'fdo', 'febr', 'ff. aa', 'ff.cc', 'fig', 'fil', 'fra', 'g.p', 'g/p', 'gob', 'gr', 'gral', 'grs', 'hnos', 'hs', 'igl', 'iltre', 'imp', 'impr', 'impto', 'incl', 'ing', 'inst', 'izdo', 'izq', 'izqdo', 'j.c', 'jue', 'jul', 'jun', 'kg', 'km', 'lcdo', 'ldo', 'let', 'lic', 'ltd', 'lun', 'mar', 'may', 'mg', 'min', 'mié', 'mm', 'máx', 'mín', 'mt', 'n. del t', 'n.b', 'no', 'nov', 'ntra. sra', 'núm', 'oct', 'p', 'p.a', 'p.d', 'p.ej', 'p.v.p', 'párrf', 'ppal', 'prev', 'prof', 'prov', 'ptas', 'pts', 'pza', 'pág', 'págs', 'párr', 'q.e.g.e', 'q.e.p.d', 'q.e.s.m', 'reg', 'rep', 'rr. hh', 'rte', 's', 's. a', 's.a.r', 's.e', 's.l', 's.r.c', 's.r.l', 's.s.s', 's/n', 'sdad', 'seg', 'sept', 'sig', 'sr', 'sra', 'sres', 'srta', 'sta', 'sto', 'sáb', 't.v.e', 'tamb', 'tel', 'tfno', 'ud', 'uu', 'uds', 'univ', 'v.b', 'v.e', 'vd', 'vds', 'vid', 'vie', 'vol', 'vs', 'vto', 'a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
22
- PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
22
+ PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
23
23
  NUMBER_ABBREVIATIONS = ['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel']
24
24
 
25
25
  def all
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
@@ -923,6 +923,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
923
923
  ps = PragmaticSegmenter::Segmenter.new(text: "[?][footnoteRef:6] This is a footnote.")
924
924
  expect(ps.segment).to eq(["[?][footnoteRef:6] This is a footnote."])
925
925
  end
926
+
927
+ it "correctly segments text #091" do
928
+ ps = PragmaticSegmenter::Segmenter.new(text: "[15: 12:32] [16: firma? 13:28]")
929
+ expect(ps.segment).to eq(["[15: 12:32] [16: firma? 13:28]"])
930
+ end
926
931
  end
927
932
  end
928
933
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias