pragmatic_segmenter 0.3.5 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 805bb57215b38dc30f107be60c18c141d8dc1297
4
- data.tar.gz: 389bf5f6700f44cb255279a4391c849d0ac67f69
3
+ metadata.gz: f8c68d5563d388488aeacf96083dc2c81191b364
4
+ data.tar.gz: 60f67ff5dc22c136f389f48ff9ba76350de013df
5
5
  SHA512:
6
- metadata.gz: 749c13c831913bcd606f0d08194d09ca982c7e033c59c8520bf1b20a4146934725ab8252bc54362815cd34f5ea45f8c5ed1cde2da80b8beb506093bd86138522
7
- data.tar.gz: d5f05492994604c023fc357b7b030878eb2d8554f2d898efe196530a5777d394359acd4e8c600451cec511df4e2dfafa30cc34d2589a8d983ae3b11299aa9869
6
+ metadata.gz: 3dcc1aa9da843232653928fb1a961f1b9d053aa9556924c4bed109a4c250c32bf1f11ccd69bdef6e6e1f40e3293e14d6274b06bd689cd03fb50c155200f29a98
7
+ data.tar.gz: 5b0220d3d9645a78025bdd76b9bff39611255de9e8d568beb053d738e42152ada640bd3d973e6f833ef77e5ff66f12ecd7dd1450617062f7d37fb558ff25ad28
data/README.md CHANGED
@@ -817,7 +817,16 @@ To test the relative performance of different segmentation tools and libraries I
817
817
  * Add English abbreviations
818
818
 
819
819
  **Version 0.3.3**
820
- * Fix cleaner bug
820
+ * Fix cleaner bug
821
+
822
+ **Version 0.3.4**
823
+ * Large refactor
824
+
825
+ **Version 0.3.5**
826
+ * Reduce GC by replacing #gsub with #gsub! where possible
827
+
828
+ **Version 0.3.6**
829
+ * Refactor SENTENCE_STARTERS into each individual language and add SENTENCE_STARTERS for German
821
830
 
822
831
  ## Contributing
823
832
 
@@ -5,8 +5,6 @@ module PragmaticSegmenter
5
5
  # replaces the periods.
6
6
  class AbbreviationReplacer
7
7
 
8
- SENTENCE_STARTERS = %w(A Being Did For He How However I In It Millions More She That The There They We What When Where Who Why)
9
-
10
8
  attr_reader :text
11
9
  def initialize(text:, language: )
12
10
  @text = Text.new(text)
@@ -75,18 +73,19 @@ module PragmaticSegmenter
75
73
  # and try to cover the words that most often start a
76
74
  # sentence but could never follow one of the abbreviations below.
77
75
 
78
- SENTENCE_STARTERS.each do |word|
79
- txt.gsub!(/U∯S∯\s#{Regexp.escape(word)}\s/, "U∯S\.\s#{Regexp.escape(word)}\s")
80
- txt.gsub!(/U\.S∯\s#{Regexp.escape(word)}\s/, "U\.S\.\s#{Regexp.escape(word)}\s")
81
- txt.gsub!(/U∯K∯\s#{Regexp.escape(word)}\s/, "U∯K\.\s#{Regexp.escape(word)}\s")
82
- txt.gsub!(/U\.K∯\s#{Regexp.escape(word)}\s/, "U\.K\.\s#{Regexp.escape(word)}\s")
83
- txt.gsub!(/E∯U∯\s#{Regexp.escape(word)}\s/, "E∯U\.\s#{Regexp.escape(word)}\s")
84
- txt.gsub!(/E\.U∯\s#{Regexp.escape(word)}\s/, "E\.U\.\s#{Regexp.escape(word)}\s")
85
- txt.gsub!(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
86
- txt.gsub!(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
87
- txt.gsub!(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
88
- txt.gsub!(/i.v∯\s#{Regexp.escape(word)}\s/, "i\.v\.\s#{Regexp.escape(word)}\s")
89
- txt.gsub!(/I.V∯\s#{Regexp.escape(word)}\s/, "I\.V\.\s#{Regexp.escape(word)}\s")
76
+ @language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word|
77
+ escaped = Regexp.escape(word)
78
+ txt.gsub!(/US∯\s#{escaped}\s/, "US\.\s#{escaped}\s")
79
+ txt.gsub!(/U\.S∯\s#{escaped}\s/, "U\.S\.\s#{escaped}\s")
80
+ txt.gsub!(/UK∯\s#{escaped}\s/, "UK\.\s#{escaped}\s")
81
+ txt.gsub!(/U\.K∯\s#{escaped}\s/, "U\.K\.\s#{escaped}\s")
82
+ txt.gsub!(/EU∯\s#{escaped}\s/, "EU\.\s#{escaped}\s")
83
+ txt.gsub!(/E\.U∯\s#{escaped}\s/, "E\.U\.\s#{escaped}\s")
84
+ txt.gsub!(/USA∯\s#{escaped}\s/, "USA\.\s#{escaped}\s")
85
+ txt.gsub!(/U\.S\.A∯\s#{escaped}\s/, "U\.S\.A\.\s#{escaped}\s")
86
+ txt.gsub!(/I∯\s#{escaped}\s/, "I\.\s#{escaped}\s")
87
+ txt.gsub!(/i.v∯\s#{escaped}\s/, "i\.v\.\s#{escaped}\s")
88
+ txt.gsub!(/I.V∯\s#{escaped}\s/, "I\.V\.\s#{escaped}\s")
90
89
  end
91
90
  txt
92
91
  end
@@ -5,6 +5,10 @@ module PragmaticSegmenter
5
5
 
6
6
  SENTENCE_BOUNDARY_REGEX = /.*?[፧።!\?]|.*?$/
7
7
  Punctuations = ['።', '፧', '?', '!']
8
+
9
+ class AbbreviationReplacer < AbbreviationReplacer
10
+ SENTENCE_STARTERS = [].freeze
11
+ end
8
12
  end
9
13
  end
10
14
  end
@@ -18,7 +18,8 @@ module PragmaticSegmenter
18
18
  # Rubular: http://rubular.com/r/kPRgApNHUg
19
19
  ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
20
20
 
21
- class AbbreviationReplacer < AbbreviationReplacer
21
+ class AbbreviationReplacer < AbbreviationReplacer
22
+ SENTENCE_STARTERS = [].freeze
22
23
  private
23
24
 
24
25
  def scan_for_replacements(txt, am, index, character_array)
@@ -5,6 +5,10 @@ module PragmaticSegmenter
5
5
 
6
6
  SENTENCE_BOUNDARY_REGEX = /.*?[։՜:]|.*?$/
7
7
  Punctuations = ['։', '՜', ':']
8
+
9
+ class AbbreviationReplacer < AbbreviationReplacer
10
+ SENTENCE_STARTERS = [].freeze
11
+ end
8
12
  end
9
13
  end
10
14
  end
@@ -5,6 +5,10 @@ module PragmaticSegmenter
5
5
 
6
6
  SENTENCE_BOUNDARY_REGEX = /.*?[။၏!\?]|.*?$/
7
7
  Punctuations = ['။', '၏', '?', '!']
8
+
9
+ class AbbreviationReplacer < AbbreviationReplacer
10
+ SENTENCE_STARTERS = [].freeze
11
+ end
8
12
  end
9
13
  end
10
14
  end
@@ -3,6 +3,9 @@ module PragmaticSegmenter
3
3
  module Chinese
4
4
  include Languages::Common
5
5
 
6
+ class AbbreviationReplacer < AbbreviationReplacer
7
+ SENTENCE_STARTERS = [].freeze
8
+ end
6
9
  end
7
10
  end
8
11
  end
@@ -97,6 +97,14 @@ module PragmaticSegmenter
97
97
  ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ')
98
98
 
99
99
  SubSingleQuoteRule = Rule.new(/&⎋&/, "'")
100
+
101
+ class AbbreviationReplacer < AbbreviationReplacer
102
+ SENTENCE_STARTERS = %w(
103
+ A Being Did For He How However I In It Millions More She That The
104
+ There They We What When Where Who Why
105
+ ).freeze
106
+ end
107
+
100
108
  end
101
109
  end
102
110
  end
@@ -58,7 +58,13 @@ module PragmaticSegmenter
58
58
  end
59
59
  end
60
60
 
61
- class AbbreviationReplacer < AbbreviationReplacer
61
+ class AbbreviationReplacer < AbbreviationReplacer
62
+
63
+ SENTENCE_STARTERS = %w(
64
+ Am Auch Auf Bei Da Das Der Die Ein Eine Es Für Heute Ich Im In
65
+ Ist Jetzt Mein Mit Nach So Und Warum Was Wenn Wer Wie Wir
66
+ ).freeze
67
+
62
68
  def replace
63
69
  @text = text.apply(
64
70
  @language::PossessiveAbbreviationRule,
@@ -19,6 +19,13 @@ module PragmaticSegmenter
19
19
  []
20
20
  end
21
21
  end
22
+
23
+ class AbbreviationReplacer < AbbreviationReplacer
24
+ SENTENCE_STARTERS = %w(
25
+ A Being Did For He How However I In It Millions More She That The
26
+ There They We What When Where Who Why
27
+ ).freeze
28
+ end
22
29
  end
23
30
  end
24
31
  end
@@ -8,6 +8,10 @@ module PragmaticSegmenter
8
8
  PREPOSITIVE_ABBREVIATIONS = []
9
9
  NUMBER_ABBREVIATIONS = []
10
10
  end
11
+
12
+ class AbbreviationReplacer < AbbreviationReplacer
13
+ SENTENCE_STARTERS = [].freeze
14
+ end
11
15
  end
12
16
  end
13
17
  end
@@ -5,6 +5,10 @@ module PragmaticSegmenter
5
5
 
6
6
  SENTENCE_BOUNDARY_REGEX = /.*?[\.;!\?]|.*?$/
7
7
  Punctuations = ['.', '!', ';', '?']
8
+
9
+ class AbbreviationReplacer < AbbreviationReplacer
10
+ SENTENCE_STARTERS = [].freeze
11
+ end
8
12
  end
9
13
  end
10
14
  end
@@ -5,6 +5,10 @@ module PragmaticSegmenter
5
5
 
6
6
  SENTENCE_BOUNDARY_REGEX = /.*?[।\|!\?]|.*?$/
7
7
  Punctuations = ['।', '|', '.', '!', '?']
8
+
9
+ class AbbreviationReplacer < AbbreviationReplacer
10
+ SENTENCE_STARTERS = [].freeze
11
+ end
8
12
  end
9
13
  end
10
14
  end
@@ -8,6 +8,10 @@ module PragmaticSegmenter
8
8
  PREPOSITIVE_ABBREVIATIONS = ['a.c', 'acc', 'adj', 'adm', 'adv', 'all', 'amn', 'arch', 'asst', 'avv', 'banc', 'bart', 'bcc', 'bldg', 'brig', 'bros', 'c.a', 'c.a.p', 'c.c.p', 'c.m', 'c.p', 'c.p', 'c.s', 'c.v', 'capt', 'cc', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'corr', 'cpl', 'dir', 'dott', 'dott', 'dr', 'dr', 'drs', 'e.p.c', 'ecc', 'egr', 'ens', 'es', 'fatt', 'gen', 'geom', 'gg', 'gov', 'hon', 'hosp', 'hr', 'id', 'ing', 'insp', 'int', "l'avv", "l'ing", 'lett', 'lt', 'maj', 'messrs', 'mlle', 'mm', 'mme', 'mo', 'mons', 'mr', 'mr', 'mrs', 'mrs', 'ms', 'ms', 'msgr', 'n.b', 'ogg', 'on', 'op', 'ord', 'p.c', 'p.c.c', 'p.es', 'p.f', 'p.r', 'p.s', 'p.t', 'p.v', 'pfc', 'ph', 'post', 'pp', 'prof', 'psicol', 'pvt', 'racc', 'rag', 'rep', 'reps', 'res', 'rev', 'ric', 'rif', 'rp', 'rsvp', 'rt', 's.a', 's.b.f', 's.n.c', 's.p.a', 's.p.m', 's.r.l', 'seg', 'sen', 'sens', 'sfc', 'sgg', 'sgt', 'sig', 'sigg', 'soc', 'spett', 'sr', 'ss', 'st', 'supt', 'surg', 'tel', 'u.s', 'v.p', 'v.r', 'v.s']
9
9
  NUMBER_ABBREVIATIONS = ['art', 'no', 'nos', 'nr', 'pp']
10
10
  end
11
+
12
+ class AbbreviationReplacer < AbbreviationReplacer
13
+ SENTENCE_STARTERS = [].freeze
14
+ end
11
15
  end
12
16
  end
13
17
  end
@@ -19,6 +19,10 @@ module PragmaticSegmenter
19
19
  end
20
20
  end
21
21
 
22
+ class AbbreviationReplacer < AbbreviationReplacer
23
+ SENTENCE_STARTERS = [].freeze
24
+ end
25
+
22
26
  class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
23
27
  # Rubular: http://rubular.com/r/GnjOmry5Z2
24
28
  BETWEEN_QUOTE_JA_REGEX = /\u{300c}(?>[^\u{300c}\u{300d}\\]+|\\{2}|\\.)*\u{300d}/
@@ -9,7 +9,9 @@ module PragmaticSegmenter
9
9
  ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
10
10
  ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
11
11
 
12
- class AbbreviationReplacer < AbbreviationReplacer
12
+ class AbbreviationReplacer < AbbreviationReplacer
13
+ SENTENCE_STARTERS = [].freeze
14
+
13
15
  private
14
16
 
15
17
  def scan_for_replacements(txt, am, index, character_array)
@@ -8,6 +8,10 @@ module PragmaticSegmenter
8
8
  PREPOSITIVE_ABBREVIATIONS = []
9
9
  NUMBER_ABBREVIATIONS = []
10
10
  end
11
+
12
+ class AbbreviationReplacer < AbbreviationReplacer
13
+ SENTENCE_STARTERS = [].freeze
14
+ end
11
15
  end
12
16
  end
13
17
  end
@@ -9,7 +9,9 @@ module PragmaticSegmenter
9
9
  NUMBER_ABBREVIATIONS = []
10
10
  end
11
11
 
12
- class AbbreviationReplacer < AbbreviationReplacer
12
+ class AbbreviationReplacer < AbbreviationReplacer
13
+ SENTENCE_STARTERS = [].freeze
14
+
13
15
  private
14
16
 
15
17
  def replace_period_of_abbr(txt, abbr)
@@ -8,6 +8,10 @@ module PragmaticSegmenter
8
8
  PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
9
9
  NUMBER_ABBREVIATIONS = ['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel']
10
10
  end
11
+
12
+ class AbbreviationReplacer < AbbreviationReplacer
13
+ SENTENCE_STARTERS = [].freeze
14
+ end
11
15
  end
12
16
  end
13
17
  end
@@ -5,6 +5,10 @@ module PragmaticSegmenter
5
5
 
6
6
  SENTENCE_BOUNDARY_REGEX = /.*?[۔؟!\?]|.*?$/
7
7
  Punctuations = ['?', '!', '۔', '؟']
8
+
9
+ class AbbreviationReplacer < AbbreviationReplacer
10
+ SENTENCE_STARTERS = [].freeze
11
+ end
8
12
  end
9
13
  end
10
14
  end
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.3.5"
2
+ VERSION = "0.3.6"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.5
4
+ version: 0.3.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-04 00:00:00.000000000 Z
11
+ date: 2016-01-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler