pragmatic_segmenter 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -0
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +7 -4
- data/lib/pragmatic_segmenter/cleaner.rb +8 -4
- data/lib/pragmatic_segmenter/languages/spanish.rb +1 -1
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +5 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ebda4e8fba93e6cc9fbaeb57fe71e0d3ae47d721
|
4
|
+
data.tar.gz: 3b2e121d592797dea12efc587c198082a4a23955
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 169f112a82005dcd9b399b8337ef0e25afffde10f6a30f4c038f7621550dd3696c4ad054725763734d420de6445577946c8fc6c249e1d0ff13f5788adde0b9b9
|
7
|
+
data.tar.gz: 9b4b51797b69972e30adc23f948de2a26bc9a9e160ce1271c8f869055e466be83e451bc36eed44020d1e6ee903922d3594ef254e01df337ee884ca80a2a72965
|
data/README.md
CHANGED
@@ -754,6 +754,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
754
754
|
* Fix missing abbreviations
|
755
755
|
* Add footnote rule to `cleaner.rb`
|
756
756
|
|
757
|
+
**Version 0.1.3**
|
758
|
+
* Improve punctuation in bracket replacement
|
759
|
+
|
757
760
|
## Contributing
|
758
761
|
|
759
762
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -136,16 +136,19 @@ module PragmaticSegmenter
|
|
136
136
|
end
|
137
137
|
|
138
138
|
# Replaces the period that terminates +abbr+ with the placeholder '∯' when
# the abbreviation is followed by a number or by an opening parenthesis.
#
# The abbreviation must be preceded by whitespace or sit at the start of a
# line. Two passes are made over the text:
#   1. abbr + "." followed by one whitespace character and a digit
#   2. abbr + "." followed by one or more whitespace characters and "("
#
# @param txt  [String] text to scan (not mutated; a new string is returned)
# @param abbr [String] abbreviation without its trailing period; surrounding
#   whitespace is stripped before matching
# @return [String] copy of +txt+ with the matching periods replaced by '∯'
def replace_pre_number_abbr(txt, abbr)
  # Escape the abbreviation so that characters such as the inner "." of
  # "a.c" are matched literally instead of acting as regex wildcards.
  stem = Regexp.escape(abbr.strip)
  txt.gsub(/(?<=\s#{stem})\.(?=\s\d)|(?<=^#{stem})\.(?=\s\d)/, '∯')
     .gsub(/(?<=\s#{stem})\.(?=\s+\()|(?<=^#{stem})\.(?=\s+\()/, '∯')
end
|
141
143
|
|
142
144
|
# Replaces the period that terminates +abbr+ with the placeholder '∯' when
# the abbreviation is followed by whitespace, or by a colon and digits.
#
# The abbreviation must be preceded by whitespace or sit at the start of a
# line. Two passes are made over the text:
#   1. abbr + "." followed by a whitespace character
#   2. abbr + "." followed by ":" and at least one digit
#
# @param txt  [String] text to scan (not mutated; a new string is returned)
# @param abbr [String] abbreviation without its trailing period; surrounding
#   whitespace is stripped before matching
# @return [String] copy of +txt+ with the matching periods replaced by '∯'
def replace_prepositive_abbr(txt, abbr)
  # Escape the abbreviation so that characters such as the inner "." of
  # "ph.d" are matched literally instead of acting as regex wildcards.
  stem = Regexp.escape(abbr.strip)
  txt.gsub(/(?<=\s#{stem})\.(?=\s)|(?<=^#{stem})\.(?=\s)/, '∯')
     .gsub(/(?<=\s#{stem})\.(?=:\d+)|(?<=^#{stem})\.(?=:\d+)/, '∯')
end
|
145
148
|
|
146
149
|
# Replaces the period that terminates +abbr+ with the placeholder '∯' when
# what follows indicates the sentence continues.
#
# The abbreviation must be preceded by whitespace or sit at the start of a
# line. Two passes are made over the text:
#   1. abbr + "." followed by ".", ":" or "?", or by whitespace and then a
#      lowercase ASCII letter, "I" + whitespace, "I'm", "I'll", or a digit
#   2. abbr + "." followed by ","
#
# @param txt  [String] text to scan (not mutated; a new string is returned)
# @param abbr [String] abbreviation without its trailing period; surrounding
#   whitespace is stripped before matching
# @return [String] copy of +txt+ with the matching periods replaced by '∯'
def replace_period_of_abbr(txt, abbr)
  # Escape the abbreviation so that characters such as the inner "." of
  # "p.ej" are matched literally instead of acting as regex wildcards.
  stem = Regexp.escape(abbr.strip)
  txt.gsub(/(?<=\s#{stem})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{stem})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
     .gsub(/(?<=\s#{stem})\.(?=,)|(?<=^#{stem})\.(?=,)/, '∯')
end
|
150
153
|
|
151
154
|
def replace_possessive_abbreviations(txt)
|
@@ -61,9 +61,6 @@ module PragmaticSegmenter
|
|
61
61
|
# Rubular: http://rubular.com/r/IQ4TPfsbd8
|
62
62
|
ConsecutiveForwardSlashRule = Rule.new(/\/{3}/, '')
|
63
63
|
|
64
|
-
# Rubular: http://rubular.com/r/gEjxQ0HmSD
|
65
|
-
FootnoteRule = Rule.new(/\[\?\]/, '[&ᓷ&]')
|
66
|
-
|
67
64
|
# Rubular: http://rubular.com/r/6dt98uI76u
|
68
65
|
NoSpaceBetweenSentencesRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
|
69
66
|
|
@@ -108,7 +105,8 @@ module PragmaticSegmenter
|
|
108
105
|
replace_newlines(@clean_text)
|
109
106
|
replace_escaped_newlines(@clean_text)
|
110
107
|
@clean_text.apply(HtmlRules::All)
|
111
|
-
@clean_text
|
108
|
+
replace_punctuation_in_brackets(@clean_text)
|
109
|
+
@clean_text.apply(InlineFormattingRule)
|
112
110
|
clean_quotations(@clean_text)
|
113
111
|
clean_table_of_contents(@clean_text)
|
114
112
|
check_for_no_space_in_between_sentences(@clean_text)
|
@@ -126,6 +124,12 @@ module PragmaticSegmenter
|
|
126
124
|
txt
|
127
125
|
end
|
128
126
|
|
127
|
+
# Replaces every "?" that appears inside a square-bracketed span
# ("[" ... "]" with no "]" in between) with the placeholder '&ᓷ&'.
# Question marks outside brackets are left untouched.
#
# @param txt [String] text to process; it is modified in place
# @return [String] the same +txt+ object, after modification
def replace_punctuation_in_brackets(txt)
  # The block form of gsub! inserts the block's result verbatim, so backslash
  # sequences inside a bracket (e.g. "\0") are never expanded as
  # back-references, and the text is scanned only once.
  txt.gsub!(/\[(?:[^\]])*\]/) { |bracketed| bracketed.gsub('?', '&ᓷ&') }
  # gsub! returns nil when nothing matched, so return the string explicitly.
  txt
end
|
132
|
+
|
129
133
|
def search_for_connected_sentences(word, txt, regex, rule)
|
130
134
|
if word =~ regex
|
131
135
|
unless URL_EMAIL_KEYWORDS.any? { |web| word =~ /#{web}/ }
|
@@ -19,7 +19,7 @@ module PragmaticSegmenter
|
|
19
19
|
|
20
20
|
class Abbreviation < PragmaticSegmenter::Abbreviation
|
21
21
|
ABBREVIATIONS = ['a.c', 'a/c', 'abr', 'adj', 'admón', 'afmo', 'ago', 'almte', 'ap', 'apdo', 'arq', 'art', 'atte', 'av', 'avda', 'bco', 'bibl', 'bs. as', 'c', 'c.f', 'c.g', 'c/c', 'c/u', 'cap', 'cc.aa', 'cdad', 'cm', 'co', 'cra', 'cta', 'cv', 'd.e.p', 'da', 'dcha', 'dcho', 'dep', 'dic', 'dicc', 'dir', 'dn', 'doc', 'dom', 'dpto', 'dr', 'dra', 'dto', 'ee', 'ej', 'en', 'entlo', 'esq', 'etc', 'excmo', 'ext', 'f.c', 'fca', 'fdo', 'febr', 'ff. aa', 'ff.cc', 'fig', 'fil', 'fra', 'g.p', 'g/p', 'gob', 'gr', 'gral', 'grs', 'hnos', 'hs', 'igl', 'iltre', 'imp', 'impr', 'impto', 'incl', 'ing', 'inst', 'izdo', 'izq', 'izqdo', 'j.c', 'jue', 'jul', 'jun', 'kg', 'km', 'lcdo', 'ldo', 'let', 'lic', 'ltd', 'lun', 'mar', 'may', 'mg', 'min', 'mié', 'mm', 'máx', 'mín', 'mt', 'n. del t', 'n.b', 'no', 'nov', 'ntra. sra', 'núm', 'oct', 'p', 'p.a', 'p.d', 'p.ej', 'p.v.p', 'párrf', 'ppal', 'prev', 'prof', 'prov', 'ptas', 'pts', 'pza', 'pág', 'págs', 'párr', 'q.e.g.e', 'q.e.p.d', 'q.e.s.m', 'reg', 'rep', 'rr. hh', 'rte', 's', 's. a', 's.a.r', 's.e', 's.l', 's.r.c', 's.r.l', 's.s.s', 's/n', 'sdad', 'seg', 'sept', 'sig', 'sr', 'sra', 'sres', 'srta', 'sta', 'sto', 'sáb', 't.v.e', 'tamb', 'tel', 'tfno', 'ud', 'uu', 'uds', 'univ', 'v.b', 'v.e', 'vd', 'vds', 'vid', 'vie', 'vol', 'vs', 'vto', 'a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
|
22
|
-
PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
|
22
|
+
PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
|
23
23
|
NUMBER_ABBREVIATIONS = ['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel']
|
24
24
|
|
25
25
|
def all
|
@@ -923,6 +923,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
923
923
|
ps = PragmaticSegmenter::Segmenter.new(text: "[?][footnoteRef:6] This is a footnote.")
|
924
924
|
expect(ps.segment).to eq(["[?][footnoteRef:6] This is a footnote."])
|
925
925
|
end
|
926
|
+
|
927
|
+
it "correctly segments text #091" do
|
928
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "[15: 12:32] [16: firma? 13:28]")
|
929
|
+
expect(ps.segment).to eq(["[15: 12:32] [16: firma? 13:28]"])
|
930
|
+
end
|
926
931
|
end
|
927
932
|
end
|
928
933
|
|