pragmatic_segmenter 0.3.5 → 0.3.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 805bb57215b38dc30f107be60c18c141d8dc1297
4
- data.tar.gz: 389bf5f6700f44cb255279a4391c849d0ac67f69
3
+ metadata.gz: f8c68d5563d388488aeacf96083dc2c81191b364
4
+ data.tar.gz: 60f67ff5dc22c136f389f48ff9ba76350de013df
5
5
  SHA512:
6
- metadata.gz: 749c13c831913bcd606f0d08194d09ca982c7e033c59c8520bf1b20a4146934725ab8252bc54362815cd34f5ea45f8c5ed1cde2da80b8beb506093bd86138522
7
- data.tar.gz: d5f05492994604c023fc357b7b030878eb2d8554f2d898efe196530a5777d394359acd4e8c600451cec511df4e2dfafa30cc34d2589a8d983ae3b11299aa9869
6
+ metadata.gz: 3dcc1aa9da843232653928fb1a961f1b9d053aa9556924c4bed109a4c250c32bf1f11ccd69bdef6e6e1f40e3293e14d6274b06bd689cd03fb50c155200f29a98
7
+ data.tar.gz: 5b0220d3d9645a78025bdd76b9bff39611255de9e8d568beb053d738e42152ada640bd3d973e6f833ef77e5ff66f12ecd7dd1450617062f7d37fb558ff25ad28
data/README.md CHANGED
@@ -817,7 +817,16 @@ To test the relative performance of different segmentation tools and libraries I
817
817
  * Add English abbreviations
818
818
 
819
819
  **Version 0.3.3**
820
- * Fix cleaner bug
820
+ * Fix cleaner bug
821
+
822
+ **Version 0.3.4**
823
+ * Large refactor
824
+
825
+ **Version 0.3.5**
826
+ * Reduce GC by replacing #gsub with #gsub! where possible
827
+
828
+ **Version 0.3.6**
829
+ * Move SENTENCE_STARTERS into each individual language and add SENTENCE_STARTERS for German
821
830
 
822
831
  ## Contributing
823
832
 
@@ -5,8 +5,6 @@ module PragmaticSegmenter
5
5
  # replaces the periods.
6
6
  class AbbreviationReplacer
7
7
 
8
- SENTENCE_STARTERS = %w(A Being Did For He How However I In It Millions More She That The There They We What When Where Who Why)
9
-
10
8
  attr_reader :text
11
9
  def initialize(text:, language: )
12
10
  @text = Text.new(text)
@@ -75,18 +73,19 @@ module PragmaticSegmenter
75
73
  # and try to cover the words that most often start a
76
74
  # sentence but could never follow one of the abbreviations below.
77
75
 
78
- SENTENCE_STARTERS.each do |word|
79
- txt.gsub!(/U∯S∯\s#{Regexp.escape(word)}\s/, "U∯S\.\s#{Regexp.escape(word)}\s")
80
- txt.gsub!(/U\.S∯\s#{Regexp.escape(word)}\s/, "U\.S\.\s#{Regexp.escape(word)}\s")
81
- txt.gsub!(/U∯K∯\s#{Regexp.escape(word)}\s/, "U∯K\.\s#{Regexp.escape(word)}\s")
82
- txt.gsub!(/U\.K∯\s#{Regexp.escape(word)}\s/, "U\.K\.\s#{Regexp.escape(word)}\s")
83
- txt.gsub!(/E∯U∯\s#{Regexp.escape(word)}\s/, "E∯U\.\s#{Regexp.escape(word)}\s")
84
- txt.gsub!(/E\.U∯\s#{Regexp.escape(word)}\s/, "E\.U\.\s#{Regexp.escape(word)}\s")
85
- txt.gsub!(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
86
- txt.gsub!(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
87
- txt.gsub!(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
88
- txt.gsub!(/i.v∯\s#{Regexp.escape(word)}\s/, "i\.v\.\s#{Regexp.escape(word)}\s")
89
- txt.gsub!(/I.V∯\s#{Regexp.escape(word)}\s/, "I\.V\.\s#{Regexp.escape(word)}\s")
76
+ @language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word|
77
+ escaped = Regexp.escape(word)
78
+ txt.gsub!(/US∯\s#{escaped}\s/, "US\.\s#{escaped}\s")
79
+ txt.gsub!(/U\.S∯\s#{escaped}\s/, "U\.S\.\s#{escaped}\s")
80
+ txt.gsub!(/UK∯\s#{escaped}\s/, "UK\.\s#{escaped}\s")
81
+ txt.gsub!(/U\.K∯\s#{escaped}\s/, "U\.K\.\s#{escaped}\s")
82
+ txt.gsub!(/EU∯\s#{escaped}\s/, "EU\.\s#{escaped}\s")
83
+ txt.gsub!(/E\.U∯\s#{escaped}\s/, "E\.U\.\s#{escaped}\s")
84
+ txt.gsub!(/USA∯\s#{escaped}\s/, "USA\.\s#{escaped}\s")
85
+ txt.gsub!(/U\.S\.A∯\s#{escaped}\s/, "U\.S\.A\.\s#{escaped}\s")
86
+ txt.gsub!(/I∯\s#{escaped}\s/, "I\.\s#{escaped}\s")
87
+ txt.gsub!(/i.v∯\s#{escaped}\s/, "i\.v\.\s#{escaped}\s")
88
+ txt.gsub!(/I.V∯\s#{escaped}\s/, "I\.V\.\s#{escaped}\s")
90
89
  end
91
90
  txt
92
91
  end
@@ -5,6 +5,10 @@ module PragmaticSegmenter
5
5
 
6
6
  SENTENCE_BOUNDARY_REGEX = /.*?[፧።!\?]|.*?$/
7
7
  Punctuations = ['።', '፧', '?', '!']
8
+
9
+ class AbbreviationReplacer < AbbreviationReplacer
10
+ SENTENCE_STARTERS = [].freeze
11
+ end
8
12
  end
9
13
  end
10
14
  end
@@ -18,7 +18,8 @@ module PragmaticSegmenter
18
18
  # Rubular: http://rubular.com/r/kPRgApNHUg
19
19
  ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
20
20
 
21
- class AbbreviationReplacer < AbbreviationReplacer
21
+ class AbbreviationReplacer < AbbreviationReplacer
22
+ SENTENCE_STARTERS = [].freeze
22
23
  private
23
24
 
24
25
  def scan_for_replacements(txt, am, index, character_array)
@@ -5,6 +5,10 @@ module PragmaticSegmenter
5
5
 
6
6
  SENTENCE_BOUNDARY_REGEX = /.*?[։՜:]|.*?$/
7
7
  Punctuations = ['։', '՜', ':']
8
+
9
+ class AbbreviationReplacer < AbbreviationReplacer
10
+ SENTENCE_STARTERS = [].freeze
11
+ end
8
12
  end
9
13
  end
10
14
  end
@@ -5,6 +5,10 @@ module PragmaticSegmenter
5
5
 
6
6
  SENTENCE_BOUNDARY_REGEX = /.*?[။၏!\?]|.*?$/
7
7
  Punctuations = ['။', '၏', '?', '!']
8
+
9
+ class AbbreviationReplacer < AbbreviationReplacer
10
+ SENTENCE_STARTERS = [].freeze
11
+ end
8
12
  end
9
13
  end
10
14
  end
@@ -3,6 +3,9 @@ module PragmaticSegmenter
3
3
  module Chinese
4
4
  include Languages::Common
5
5
 
6
+ class AbbreviationReplacer < AbbreviationReplacer
7
+ SENTENCE_STARTERS = [].freeze
8
+ end
6
9
  end
7
10
  end
8
11
  end
@@ -97,6 +97,14 @@ module PragmaticSegmenter
97
97
  ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ')
98
98
 
99
99
  SubSingleQuoteRule = Rule.new(/&⎋&/, "'")
100
+
101
+ class AbbreviationReplacer < AbbreviationReplacer
102
+ SENTENCE_STARTERS = %w(
103
+ A Being Did For He How However I In It Millions More She That The
104
+ There They We What When Where Who Why
105
+ ).freeze
106
+ end
107
+
100
108
  end
101
109
  end
102
110
  end
@@ -58,7 +58,13 @@ module PragmaticSegmenter
58
58
  end
59
59
  end
60
60
 
61
- class AbbreviationReplacer < AbbreviationReplacer
61
+ class AbbreviationReplacer < AbbreviationReplacer
62
+
63
+ SENTENCE_STARTERS = %w(
64
+ Am Auch Auf Bei Da Das Der Die Ein Eine Es Für Heute Ich Im In
65
+ Ist Jetzt Mein Mit Nach So Und Warum Was Wenn Wer Wie Wir
66
+ ).freeze
67
+
62
68
  def replace
63
69
  @text = text.apply(
64
70
  @language::PossessiveAbbreviationRule,
@@ -19,6 +19,13 @@ module PragmaticSegmenter
19
19
  []
20
20
  end
21
21
  end
22
+
23
+ class AbbreviationReplacer < AbbreviationReplacer
24
+ SENTENCE_STARTERS = %w(
25
+ A Being Did For He How However I In It Millions More She That The
26
+ There They We What When Where Who Why
27
+ ).freeze
28
+ end
22
29
  end
23
30
  end
24
31
  end
@@ -8,6 +8,10 @@ module PragmaticSegmenter
8
8
  PREPOSITIVE_ABBREVIATIONS = []
9
9
  NUMBER_ABBREVIATIONS = []
10
10
  end
11
+
12
+ class AbbreviationReplacer < AbbreviationReplacer
13
+ SENTENCE_STARTERS = [].freeze
14
+ end
11
15
  end
12
16
  end
13
17
  end
@@ -5,6 +5,10 @@ module PragmaticSegmenter
5
5
 
6
6
  SENTENCE_BOUNDARY_REGEX = /.*?[\.;!\?]|.*?$/
7
7
  Punctuations = ['.', '!', ';', '?']
8
+
9
+ class AbbreviationReplacer < AbbreviationReplacer
10
+ SENTENCE_STARTERS = [].freeze
11
+ end
8
12
  end
9
13
  end
10
14
  end
@@ -5,6 +5,10 @@ module PragmaticSegmenter
5
5
 
6
6
  SENTENCE_BOUNDARY_REGEX = /.*?[।\|!\?]|.*?$/
7
7
  Punctuations = ['।', '|', '.', '!', '?']
8
+
9
+ class AbbreviationReplacer < AbbreviationReplacer
10
+ SENTENCE_STARTERS = [].freeze
11
+ end
8
12
  end
9
13
  end
10
14
  end
@@ -8,6 +8,10 @@ module PragmaticSegmenter
8
8
  PREPOSITIVE_ABBREVIATIONS = ['a.c', 'acc', 'adj', 'adm', 'adv', 'all', 'amn', 'arch', 'asst', 'avv', 'banc', 'bart', 'bcc', 'bldg', 'brig', 'bros', 'c.a', 'c.a.p', 'c.c.p', 'c.m', 'c.p', 'c.p', 'c.s', 'c.v', 'capt', 'cc', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'corr', 'cpl', 'dir', 'dott', 'dott', 'dr', 'dr', 'drs', 'e.p.c', 'ecc', 'egr', 'ens', 'es', 'fatt', 'gen', 'geom', 'gg', 'gov', 'hon', 'hosp', 'hr', 'id', 'ing', 'insp', 'int', "l'avv", "l'ing", 'lett', 'lt', 'maj', 'messrs', 'mlle', 'mm', 'mme', 'mo', 'mons', 'mr', 'mr', 'mrs', 'mrs', 'ms', 'ms', 'msgr', 'n.b', 'ogg', 'on', 'op', 'ord', 'p.c', 'p.c.c', 'p.es', 'p.f', 'p.r', 'p.s', 'p.t', 'p.v', 'pfc', 'ph', 'post', 'pp', 'prof', 'psicol', 'pvt', 'racc', 'rag', 'rep', 'reps', 'res', 'rev', 'ric', 'rif', 'rp', 'rsvp', 'rt', 's.a', 's.b.f', 's.n.c', 's.p.a', 's.p.m', 's.r.l', 'seg', 'sen', 'sens', 'sfc', 'sgg', 'sgt', 'sig', 'sigg', 'soc', 'spett', 'sr', 'ss', 'st', 'supt', 'surg', 'tel', 'u.s', 'v.p', 'v.r', 'v.s']
9
9
  NUMBER_ABBREVIATIONS = ['art', 'no', 'nos', 'nr', 'pp']
10
10
  end
11
+
12
+ class AbbreviationReplacer < AbbreviationReplacer
13
+ SENTENCE_STARTERS = [].freeze
14
+ end
11
15
  end
12
16
  end
13
17
  end
@@ -19,6 +19,10 @@ module PragmaticSegmenter
19
19
  end
20
20
  end
21
21
 
22
+ class AbbreviationReplacer < AbbreviationReplacer
23
+ SENTENCE_STARTERS = [].freeze
24
+ end
25
+
22
26
  class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
23
27
  # Rubular: http://rubular.com/r/GnjOmry5Z2
24
28
  BETWEEN_QUOTE_JA_REGEX = /\u{300c}(?>[^\u{300c}\u{300d}\\]+|\\{2}|\\.)*\u{300d}/
@@ -9,7 +9,9 @@ module PragmaticSegmenter
9
9
  ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
10
10
  ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
11
11
 
12
- class AbbreviationReplacer < AbbreviationReplacer
12
+ class AbbreviationReplacer < AbbreviationReplacer
13
+ SENTENCE_STARTERS = [].freeze
14
+
13
15
  private
14
16
 
15
17
  def scan_for_replacements(txt, am, index, character_array)
@@ -8,6 +8,10 @@ module PragmaticSegmenter
8
8
  PREPOSITIVE_ABBREVIATIONS = []
9
9
  NUMBER_ABBREVIATIONS = []
10
10
  end
11
+
12
+ class AbbreviationReplacer < AbbreviationReplacer
13
+ SENTENCE_STARTERS = [].freeze
14
+ end
11
15
  end
12
16
  end
13
17
  end
@@ -9,7 +9,9 @@ module PragmaticSegmenter
9
9
  NUMBER_ABBREVIATIONS = []
10
10
  end
11
11
 
12
- class AbbreviationReplacer < AbbreviationReplacer
12
+ class AbbreviationReplacer < AbbreviationReplacer
13
+ SENTENCE_STARTERS = [].freeze
14
+
13
15
  private
14
16
 
15
17
  def replace_period_of_abbr(txt, abbr)
@@ -8,6 +8,10 @@ module PragmaticSegmenter
8
8
  PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
9
9
  NUMBER_ABBREVIATIONS = ['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel']
10
10
  end
11
+
12
+ class AbbreviationReplacer < AbbreviationReplacer
13
+ SENTENCE_STARTERS = [].freeze
14
+ end
11
15
  end
12
16
  end
13
17
  end
@@ -5,6 +5,10 @@ module PragmaticSegmenter
5
5
 
6
6
  SENTENCE_BOUNDARY_REGEX = /.*?[۔؟!\?]|.*?$/
7
7
  Punctuations = ['?', '!', '۔', '؟']
8
+
9
+ class AbbreviationReplacer < AbbreviationReplacer
10
+ SENTENCE_STARTERS = [].freeze
11
+ end
8
12
  end
9
13
  end
10
14
  end
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.3.5"
2
+ VERSION = "0.3.6"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.5
4
+ version: 0.3.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-04 00:00:00.000000000 Z
11
+ date: 2016-01-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler