pragmatic_segmenter 0.3.5 → 0.3.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +10 -1
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +13 -14
- data/lib/pragmatic_segmenter/languages/amharic.rb +4 -0
- data/lib/pragmatic_segmenter/languages/arabic.rb +2 -1
- data/lib/pragmatic_segmenter/languages/armenian.rb +4 -0
- data/lib/pragmatic_segmenter/languages/burmese.rb +4 -0
- data/lib/pragmatic_segmenter/languages/chinese.rb +3 -0
- data/lib/pragmatic_segmenter/languages/common.rb +8 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +7 -1
- data/lib/pragmatic_segmenter/languages/english.rb +7 -0
- data/lib/pragmatic_segmenter/languages/french.rb +4 -0
- data/lib/pragmatic_segmenter/languages/greek.rb +4 -0
- data/lib/pragmatic_segmenter/languages/hindi.rb +4 -0
- data/lib/pragmatic_segmenter/languages/italian.rb +4 -0
- data/lib/pragmatic_segmenter/languages/japanese.rb +4 -0
- data/lib/pragmatic_segmenter/languages/persian.rb +3 -1
- data/lib/pragmatic_segmenter/languages/polish.rb +4 -0
- data/lib/pragmatic_segmenter/languages/russian.rb +3 -1
- data/lib/pragmatic_segmenter/languages/spanish.rb +4 -0
- data/lib/pragmatic_segmenter/languages/urdu.rb +4 -0
- data/lib/pragmatic_segmenter/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f8c68d5563d388488aeacf96083dc2c81191b364
|
4
|
+
data.tar.gz: 60f67ff5dc22c136f389f48ff9ba76350de013df
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3dcc1aa9da843232653928fb1a961f1b9d053aa9556924c4bed109a4c250c32bf1f11ccd69bdef6e6e1f40e3293e14d6274b06bd689cd03fb50c155200f29a98
|
7
|
+
data.tar.gz: 5b0220d3d9645a78025bdd76b9bff39611255de9e8d568beb053d738e42152ada640bd3d973e6f833ef77e5ff66f12ecd7dd1450617062f7d37fb558ff25ad28
|
data/README.md
CHANGED
@@ -817,7 +817,16 @@ To test the relative performance of different segmentation tools and libraries I
|
|
817
817
|
* Add English abbreviations
|
818
818
|
|
819
819
|
**Version 0.3.3**
|
820
|
-
* Fix cleaner bug
|
820
|
+
* Fix cleaner bug
|
821
|
+
|
822
|
+
**Version 0.3.4**
|
823
|
+
* Large refactor
|
824
|
+
|
825
|
+
**Version 0.3.5**
|
826
|
+
* Reduce GC by replacing #gsub with #gsub! where possible
|
827
|
+
|
828
|
+
**Version 0.3.6**
|
829
|
+
* Refactor SENTENCE_STARTERS to each individual language and add SENTENCE_STARTERS for German
|
821
830
|
|
822
831
|
## Contributing
|
823
832
|
|
@@ -5,8 +5,6 @@ module PragmaticSegmenter
|
|
5
5
|
# replaces the periods.
|
6
6
|
class AbbreviationReplacer
|
7
7
|
|
8
|
-
SENTENCE_STARTERS = %w(A Being Did For He How However I In It Millions More She That The There They We What When Where Who Why)
|
9
|
-
|
10
8
|
attr_reader :text
|
11
9
|
def initialize(text:, language: )
|
12
10
|
@text = Text.new(text)
|
@@ -75,18 +73,19 @@ module PragmaticSegmenter
|
|
75
73
|
# and try to cover the words that most often start a
|
76
74
|
# sentence but could never follow one of the abbreviations below.
|
77
75
|
|
78
|
-
SENTENCE_STARTERS.each do |word|
|
79
|
-
|
80
|
-
txt.gsub!(/U
|
81
|
-
txt.gsub!(/U
|
82
|
-
txt.gsub!(/U
|
83
|
-
txt.gsub!(/
|
84
|
-
txt.gsub!(/E
|
85
|
-
txt.gsub!(/U
|
86
|
-
txt.gsub!(/U
|
87
|
-
txt.gsub!(/
|
88
|
-
txt.gsub!(/
|
89
|
-
txt.gsub!(/
|
76
|
+
@language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word|
|
77
|
+
escaped = Regexp.escape(word)
|
78
|
+
txt.gsub!(/U∯S∯\s#{escaped}\s/, "U∯S\.\s#{escaped}\s")
|
79
|
+
txt.gsub!(/U\.S∯\s#{escaped}\s/, "U\.S\.\s#{escaped}\s")
|
80
|
+
txt.gsub!(/U∯K∯\s#{escaped}\s/, "U∯K\.\s#{escaped}\s")
|
81
|
+
txt.gsub!(/U\.K∯\s#{escaped}\s/, "U\.K\.\s#{escaped}\s")
|
82
|
+
txt.gsub!(/E∯U∯\s#{escaped}\s/, "E∯U\.\s#{escaped}\s")
|
83
|
+
txt.gsub!(/E\.U∯\s#{escaped}\s/, "E\.U\.\s#{escaped}\s")
|
84
|
+
txt.gsub!(/U∯S∯A∯\s#{escaped}\s/, "U∯S∯A\.\s#{escaped}\s")
|
85
|
+
txt.gsub!(/U\.S\.A∯\s#{escaped}\s/, "U\.S\.A\.\s#{escaped}\s")
|
86
|
+
txt.gsub!(/I∯\s#{escaped}\s/, "I\.\s#{escaped}\s")
|
87
|
+
txt.gsub!(/i.v∯\s#{escaped}\s/, "i\.v\.\s#{escaped}\s")
|
88
|
+
txt.gsub!(/I.V∯\s#{escaped}\s/, "I\.V\.\s#{escaped}\s")
|
90
89
|
end
|
91
90
|
txt
|
92
91
|
end
|
@@ -18,7 +18,8 @@ module PragmaticSegmenter
|
|
18
18
|
# Rubular: http://rubular.com/r/kPRgApNHUg
|
19
19
|
ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
|
20
20
|
|
21
|
-
class AbbreviationReplacer
|
21
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
22
|
+
SENTENCE_STARTERS = [].freeze
|
22
23
|
private
|
23
24
|
|
24
25
|
def scan_for_replacements(txt, am, index, character_array)
|
@@ -97,6 +97,14 @@ module PragmaticSegmenter
|
|
97
97
|
ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ')
|
98
98
|
|
99
99
|
SubSingleQuoteRule = Rule.new(/&⎋&/, "'")
|
100
|
+
|
101
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
102
|
+
SENTENCE_STARTERS = %w(
|
103
|
+
A Being Did For He How However I In It Millions More She That The
|
104
|
+
There They We What When Where Who Why
|
105
|
+
).freeze
|
106
|
+
end
|
107
|
+
|
100
108
|
end
|
101
109
|
end
|
102
110
|
end
|
@@ -58,7 +58,13 @@ module PragmaticSegmenter
|
|
58
58
|
end
|
59
59
|
end
|
60
60
|
|
61
|
-
class AbbreviationReplacer
|
61
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
62
|
+
|
63
|
+
SENTENCE_STARTERS = %w(
|
64
|
+
Am Auch Auf Bei Da Das Der Die Ein Eine Es Für Heute Ich Im In
|
65
|
+
Ist Jetzt Mein Mit Nach So Und Warum Was Wenn Wer Wie Wir
|
66
|
+
).freeze
|
67
|
+
|
62
68
|
def replace
|
63
69
|
@text = text.apply(
|
64
70
|
@language::PossessiveAbbreviationRule,
|
@@ -19,6 +19,13 @@ module PragmaticSegmenter
|
|
19
19
|
[]
|
20
20
|
end
|
21
21
|
end
|
22
|
+
|
23
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
24
|
+
SENTENCE_STARTERS = %w(
|
25
|
+
A Being Did For He How However I In It Millions More She That The
|
26
|
+
There They We What When Where Who Why
|
27
|
+
).freeze
|
28
|
+
end
|
22
29
|
end
|
23
30
|
end
|
24
31
|
end
|
@@ -8,6 +8,10 @@ module PragmaticSegmenter
|
|
8
8
|
PREPOSITIVE_ABBREVIATIONS = ['a.c', 'acc', 'adj', 'adm', 'adv', 'all', 'amn', 'arch', 'asst', 'avv', 'banc', 'bart', 'bcc', 'bldg', 'brig', 'bros', 'c.a', 'c.a.p', 'c.c.p', 'c.m', 'c.p', 'c.p', 'c.s', 'c.v', 'capt', 'cc', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'corr', 'cpl', 'dir', 'dott', 'dott', 'dr', 'dr', 'drs', 'e.p.c', 'ecc', 'egr', 'ens', 'es', 'fatt', 'gen', 'geom', 'gg', 'gov', 'hon', 'hosp', 'hr', 'id', 'ing', 'insp', 'int', "l'avv", "l'ing", 'lett', 'lt', 'maj', 'messrs', 'mlle', 'mm', 'mme', 'mo', 'mons', 'mr', 'mr', 'mrs', 'mrs', 'ms', 'ms', 'msgr', 'n.b', 'ogg', 'on', 'op', 'ord', 'p.c', 'p.c.c', 'p.es', 'p.f', 'p.r', 'p.s', 'p.t', 'p.v', 'pfc', 'ph', 'post', 'pp', 'prof', 'psicol', 'pvt', 'racc', 'rag', 'rep', 'reps', 'res', 'rev', 'ric', 'rif', 'rp', 'rsvp', 'rt', 's.a', 's.b.f', 's.n.c', 's.p.a', 's.p.m', 's.r.l', 'seg', 'sen', 'sens', 'sfc', 'sgg', 'sgt', 'sig', 'sigg', 'soc', 'spett', 'sr', 'ss', 'st', 'supt', 'surg', 'tel', 'u.s', 'v.p', 'v.r', 'v.s']
|
9
9
|
NUMBER_ABBREVIATIONS = ['art', 'no', 'nos', 'nr', 'pp']
|
10
10
|
end
|
11
|
+
|
12
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
13
|
+
SENTENCE_STARTERS = [].freeze
|
14
|
+
end
|
11
15
|
end
|
12
16
|
end
|
13
17
|
end
|
@@ -19,6 +19,10 @@ module PragmaticSegmenter
|
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
22
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
23
|
+
SENTENCE_STARTERS = [].freeze
|
24
|
+
end
|
25
|
+
|
22
26
|
class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
|
23
27
|
# Rubular: http://rubular.com/r/GnjOmry5Z2
|
24
28
|
BETWEEN_QUOTE_JA_REGEX = /\u{300c}(?>[^\u{300c}\u{300d}\\]+|\\{2}|\\.)*\u{300d}/
|
@@ -9,7 +9,9 @@ module PragmaticSegmenter
|
|
9
9
|
ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
|
10
10
|
ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
|
11
11
|
|
12
|
-
class AbbreviationReplacer
|
12
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
13
|
+
SENTENCE_STARTERS = [].freeze
|
14
|
+
|
13
15
|
private
|
14
16
|
|
15
17
|
def scan_for_replacements(txt, am, index, character_array)
|
@@ -9,7 +9,9 @@ module PragmaticSegmenter
|
|
9
9
|
NUMBER_ABBREVIATIONS = []
|
10
10
|
end
|
11
11
|
|
12
|
-
class AbbreviationReplacer
|
12
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
13
|
+
SENTENCE_STARTERS = [].freeze
|
14
|
+
|
13
15
|
private
|
14
16
|
|
15
17
|
def replace_period_of_abbr(txt, abbr)
|
@@ -8,6 +8,10 @@ module PragmaticSegmenter
|
|
8
8
|
PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
|
9
9
|
NUMBER_ABBREVIATIONS = ['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel']
|
10
10
|
end
|
11
|
+
|
12
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
13
|
+
SENTENCE_STARTERS = [].freeze
|
14
|
+
end
|
11
15
|
end
|
12
16
|
end
|
13
17
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.5
|
4
|
+
version: 0.3.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-01-
|
11
|
+
date: 2016-01-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|