pragmatic_segmenter 0.3.5 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +10 -1
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +13 -14
- data/lib/pragmatic_segmenter/languages/amharic.rb +4 -0
- data/lib/pragmatic_segmenter/languages/arabic.rb +2 -1
- data/lib/pragmatic_segmenter/languages/armenian.rb +4 -0
- data/lib/pragmatic_segmenter/languages/burmese.rb +4 -0
- data/lib/pragmatic_segmenter/languages/chinese.rb +3 -0
- data/lib/pragmatic_segmenter/languages/common.rb +8 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +7 -1
- data/lib/pragmatic_segmenter/languages/english.rb +7 -0
- data/lib/pragmatic_segmenter/languages/french.rb +4 -0
- data/lib/pragmatic_segmenter/languages/greek.rb +4 -0
- data/lib/pragmatic_segmenter/languages/hindi.rb +4 -0
- data/lib/pragmatic_segmenter/languages/italian.rb +4 -0
- data/lib/pragmatic_segmenter/languages/japanese.rb +4 -0
- data/lib/pragmatic_segmenter/languages/persian.rb +3 -1
- data/lib/pragmatic_segmenter/languages/polish.rb +4 -0
- data/lib/pragmatic_segmenter/languages/russian.rb +3 -1
- data/lib/pragmatic_segmenter/languages/spanish.rb +4 -0
- data/lib/pragmatic_segmenter/languages/urdu.rb +4 -0
- data/lib/pragmatic_segmenter/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f8c68d5563d388488aeacf96083dc2c81191b364
|
4
|
+
data.tar.gz: 60f67ff5dc22c136f389f48ff9ba76350de013df
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3dcc1aa9da843232653928fb1a961f1b9d053aa9556924c4bed109a4c250c32bf1f11ccd69bdef6e6e1f40e3293e14d6274b06bd689cd03fb50c155200f29a98
|
7
|
+
data.tar.gz: 5b0220d3d9645a78025bdd76b9bff39611255de9e8d568beb053d738e42152ada640bd3d973e6f833ef77e5ff66f12ecd7dd1450617062f7d37fb558ff25ad28
|
data/README.md
CHANGED
@@ -817,7 +817,16 @@ To test the relative performance of different segmentation tools and libraries I
|
|
817
817
|
* Add English abbreviations
|
818
818
|
|
819
819
|
**Version 0.3.3**
|
820
|
-
* Fix cleaner bug
|
820
|
+
* Fix cleaner bug
|
821
|
+
|
822
|
+
**Version 0.3.4**
|
823
|
+
* Large refactor
|
824
|
+
|
825
|
+
**Version 0.3.5**
|
826
|
+
* Reduce GC by replacing #gsub with #gsub! where possible
|
827
|
+
|
828
|
+
**Version 0.3.6**
|
829
|
+
* Refactor SENTENCE_STARTERS to each individual language and add SENTENCE_STARTERS for German
|
821
830
|
|
822
831
|
## Contributing
|
823
832
|
|
@@ -5,8 +5,6 @@ module PragmaticSegmenter
|
|
5
5
|
# replaces the periods.
|
6
6
|
class AbbreviationReplacer
|
7
7
|
|
8
|
-
SENTENCE_STARTERS = %w(A Being Did For He How However I In It Millions More She That The There They We What When Where Who Why)
|
9
|
-
|
10
8
|
attr_reader :text
|
11
9
|
def initialize(text:, language: )
|
12
10
|
@text = Text.new(text)
|
@@ -75,18 +73,19 @@ module PragmaticSegmenter
|
|
75
73
|
# and try to cover the words that most often start a
|
76
74
|
# sentence but could never follow one of the abbreviations below.
|
77
75
|
|
78
|
-
SENTENCE_STARTERS.each do |word|
|
79
|
-
|
80
|
-
txt.gsub!(/U
|
81
|
-
txt.gsub!(/U
|
82
|
-
txt.gsub!(/U
|
83
|
-
txt.gsub!(/
|
84
|
-
txt.gsub!(/E
|
85
|
-
txt.gsub!(/U
|
86
|
-
txt.gsub!(/U
|
87
|
-
txt.gsub!(/
|
88
|
-
txt.gsub!(/
|
89
|
-
txt.gsub!(/
|
76
|
+
@language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word|
|
77
|
+
escaped = Regexp.escape(word)
|
78
|
+
txt.gsub!(/U∯S∯\s#{escaped}\s/, "U∯S\.\s#{escaped}\s")
|
79
|
+
txt.gsub!(/U\.S∯\s#{escaped}\s/, "U\.S\.\s#{escaped}\s")
|
80
|
+
txt.gsub!(/U∯K∯\s#{escaped}\s/, "U∯K\.\s#{escaped}\s")
|
81
|
+
txt.gsub!(/U\.K∯\s#{escaped}\s/, "U\.K\.\s#{escaped}\s")
|
82
|
+
txt.gsub!(/E∯U∯\s#{escaped}\s/, "E∯U\.\s#{escaped}\s")
|
83
|
+
txt.gsub!(/E\.U∯\s#{escaped}\s/, "E\.U\.\s#{escaped}\s")
|
84
|
+
txt.gsub!(/U∯S∯A∯\s#{escaped}\s/, "U∯S∯A\.\s#{escaped}\s")
|
85
|
+
txt.gsub!(/U\.S\.A∯\s#{escaped}\s/, "U\.S\.A\.\s#{escaped}\s")
|
86
|
+
txt.gsub!(/I∯\s#{escaped}\s/, "I\.\s#{escaped}\s")
|
87
|
+
txt.gsub!(/i.v∯\s#{escaped}\s/, "i\.v\.\s#{escaped}\s")
|
88
|
+
txt.gsub!(/I.V∯\s#{escaped}\s/, "I\.V\.\s#{escaped}\s")
|
90
89
|
end
|
91
90
|
txt
|
92
91
|
end
|
@@ -18,7 +18,8 @@ module PragmaticSegmenter
|
|
18
18
|
# Rubular: http://rubular.com/r/kPRgApNHUg
|
19
19
|
ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
|
20
20
|
|
21
|
-
class AbbreviationReplacer
|
21
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
22
|
+
SENTENCE_STARTERS = [].freeze
|
22
23
|
private
|
23
24
|
|
24
25
|
def scan_for_replacements(txt, am, index, character_array)
|
@@ -97,6 +97,14 @@ module PragmaticSegmenter
|
|
97
97
|
ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ')
|
98
98
|
|
99
99
|
SubSingleQuoteRule = Rule.new(/&⎋&/, "'")
|
100
|
+
|
101
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
102
|
+
SENTENCE_STARTERS = %w(
|
103
|
+
A Being Did For He How However I In It Millions More She That The
|
104
|
+
There They We What When Where Who Why
|
105
|
+
).freeze
|
106
|
+
end
|
107
|
+
|
100
108
|
end
|
101
109
|
end
|
102
110
|
end
|
@@ -58,7 +58,13 @@ module PragmaticSegmenter
|
|
58
58
|
end
|
59
59
|
end
|
60
60
|
|
61
|
-
class AbbreviationReplacer
|
61
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
62
|
+
|
63
|
+
SENTENCE_STARTERS = %w(
|
64
|
+
Am Auch Auf Bei Da Das Der Die Ein Eine Es Für Heute Ich Im In
|
65
|
+
Ist Jetzt Mein Mit Nach So Und Warum Was Wenn Wer Wie Wir
|
66
|
+
).freeze
|
67
|
+
|
62
68
|
def replace
|
63
69
|
@text = text.apply(
|
64
70
|
@language::PossessiveAbbreviationRule,
|
@@ -19,6 +19,13 @@ module PragmaticSegmenter
|
|
19
19
|
[]
|
20
20
|
end
|
21
21
|
end
|
22
|
+
|
23
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
24
|
+
SENTENCE_STARTERS = %w(
|
25
|
+
A Being Did For He How However I In It Millions More She That The
|
26
|
+
There They We What When Where Who Why
|
27
|
+
).freeze
|
28
|
+
end
|
22
29
|
end
|
23
30
|
end
|
24
31
|
end
|
@@ -8,6 +8,10 @@ module PragmaticSegmenter
|
|
8
8
|
PREPOSITIVE_ABBREVIATIONS = ['a.c', 'acc', 'adj', 'adm', 'adv', 'all', 'amn', 'arch', 'asst', 'avv', 'banc', 'bart', 'bcc', 'bldg', 'brig', 'bros', 'c.a', 'c.a.p', 'c.c.p', 'c.m', 'c.p', 'c.p', 'c.s', 'c.v', 'capt', 'cc', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'corr', 'cpl', 'dir', 'dott', 'dott', 'dr', 'dr', 'drs', 'e.p.c', 'ecc', 'egr', 'ens', 'es', 'fatt', 'gen', 'geom', 'gg', 'gov', 'hon', 'hosp', 'hr', 'id', 'ing', 'insp', 'int', "l'avv", "l'ing", 'lett', 'lt', 'maj', 'messrs', 'mlle', 'mm', 'mme', 'mo', 'mons', 'mr', 'mr', 'mrs', 'mrs', 'ms', 'ms', 'msgr', 'n.b', 'ogg', 'on', 'op', 'ord', 'p.c', 'p.c.c', 'p.es', 'p.f', 'p.r', 'p.s', 'p.t', 'p.v', 'pfc', 'ph', 'post', 'pp', 'prof', 'psicol', 'pvt', 'racc', 'rag', 'rep', 'reps', 'res', 'rev', 'ric', 'rif', 'rp', 'rsvp', 'rt', 's.a', 's.b.f', 's.n.c', 's.p.a', 's.p.m', 's.r.l', 'seg', 'sen', 'sens', 'sfc', 'sgg', 'sgt', 'sig', 'sigg', 'soc', 'spett', 'sr', 'ss', 'st', 'supt', 'surg', 'tel', 'u.s', 'v.p', 'v.r', 'v.s']
|
9
9
|
NUMBER_ABBREVIATIONS = ['art', 'no', 'nos', 'nr', 'pp']
|
10
10
|
end
|
11
|
+
|
12
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
13
|
+
SENTENCE_STARTERS = [].freeze
|
14
|
+
end
|
11
15
|
end
|
12
16
|
end
|
13
17
|
end
|
@@ -19,6 +19,10 @@ module PragmaticSegmenter
|
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
22
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
23
|
+
SENTENCE_STARTERS = [].freeze
|
24
|
+
end
|
25
|
+
|
22
26
|
class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
|
23
27
|
# Rubular: http://rubular.com/r/GnjOmry5Z2
|
24
28
|
BETWEEN_QUOTE_JA_REGEX = /\u{300c}(?>[^\u{300c}\u{300d}\\]+|\\{2}|\\.)*\u{300d}/
|
@@ -9,7 +9,9 @@ module PragmaticSegmenter
|
|
9
9
|
ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
|
10
10
|
ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
|
11
11
|
|
12
|
-
class AbbreviationReplacer
|
12
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
13
|
+
SENTENCE_STARTERS = [].freeze
|
14
|
+
|
13
15
|
private
|
14
16
|
|
15
17
|
def scan_for_replacements(txt, am, index, character_array)
|
@@ -9,7 +9,9 @@ module PragmaticSegmenter
|
|
9
9
|
NUMBER_ABBREVIATIONS = []
|
10
10
|
end
|
11
11
|
|
12
|
-
class AbbreviationReplacer
|
12
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
13
|
+
SENTENCE_STARTERS = [].freeze
|
14
|
+
|
13
15
|
private
|
14
16
|
|
15
17
|
def replace_period_of_abbr(txt, abbr)
|
@@ -8,6 +8,10 @@ module PragmaticSegmenter
|
|
8
8
|
PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
|
9
9
|
NUMBER_ABBREVIATIONS = ['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel']
|
10
10
|
end
|
11
|
+
|
12
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
13
|
+
SENTENCE_STARTERS = [].freeze
|
14
|
+
end
|
11
15
|
end
|
12
16
|
end
|
13
17
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.5
|
4
|
+
version: 0.3.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-01-
|
11
|
+
date: 2016-01-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|