pragmatic_segmenter 0.0.8 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 80eb7dfc7aeed8a66ff324dde7a87ea544e55e45
4
- data.tar.gz: c1f3daf78133d748d6cf02133937dad1e05ef0ee
3
+ metadata.gz: c54f1c34c1f0bd34858cbd9c3915ec47f8a4d807
4
+ data.tar.gz: 19a936bfd8fb6c0046f06a20f50319fb5a2c0ed7
5
5
  SHA512:
6
- metadata.gz: f2c6018bb4d46ccc5ef86bbc548437861fa83cf4948abd09e45d8c4b23c021c48907feb9cd65916a9ef8635ede477f5f0b8e221cb7452b38c236f79a3d0cfa77
7
- data.tar.gz: d63e6eb39e52306785e491f1bfd1d4196e992e513a857b8893ef9c24fcab95f515dc863789f5040609436dc116efbac608a7ac010c0015fa159535198b1554ea
6
+ metadata.gz: 824392373d56549289ae89de98976d53afdf74ccde45f3b6069164d8882fbeef7ae04e2fec1261af2baf43223246b4f1f90ebab12b97fa0c22e0d81d6caa56fa
7
+ data.tar.gz: 4f43e20c69be0515d81c9d98e33b7ed2aadf05d5110ddd7acae3f971745c8406c03763fdc275afa3b14de3d5aa8846a85638845d90cc4c9fd34f4cf9e120d1d3
data/README.md CHANGED
@@ -741,6 +741,9 @@ To test the relative performance of different segmentation tools and libraries I
741
741
  **Version 0.0.8**
742
742
  * Fix error in `list.rb`
743
743
 
744
+ **Version 0.0.9**
745
+ * Improve handling of alphabetical and roman numeral lists
746
+
744
747
  ## Contributing
745
748
 
746
749
  If you find a text that is incorrectly segmented using this gem, please submit an issue.
@@ -32,7 +32,7 @@ module PragmaticSegmenter
32
32
 
33
33
  # Rubular: http://rubular.com/r/NsNFSqrNvJ
34
34
  EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX =
35
- /(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i
35
+ /\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i
36
36
 
37
37
  # Rubular: http://rubular.com/r/wMpnVedEIb
38
38
  ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX =
@@ -134,10 +134,10 @@ module PragmaticSegmenter
134
134
 
135
135
  def replace_alphabet_list_parens(a, txt)
136
136
  txt.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
137
- if txt =~ /\(#{Regexp.escape(m.to_s)}\)/i
138
- a.eql?(m.dup.downcase) ? "\rȸ(#{Regexp.escape(m.to_s)}" : "#{m}"
137
+ if m.include?('(')
138
+ a.eql?(m.dup.downcase.gsub!(/\(/, '')) ? "\r&✂&#{Regexp.escape(m.gsub!(/\(/, ''))}" : "#{m}"
139
139
  else
140
- a.eql?(m.dup.downcase) ? "\r#{Regexp.escape(m.to_s)}" : "#{m}"
140
+ a.eql?(m.dup.downcase) ? "\r#{Regexp.escape(m)}" : "#{m}"
141
141
  end
142
142
  end
143
143
  end
@@ -175,6 +175,7 @@ module PragmaticSegmenter
175
175
  else
176
176
  alphabet = ('a'..'z').to_a
177
177
  end
178
+ list_array.delete_if { |item| !alphabet.any? { |a| a.include?(item) } }
178
179
  list_array.each_with_index do |a, i|
179
180
  if i.eql?(list_array.length - 1)
180
181
  last_array_item_replacement(a, i, alphabet, list_array, txt, parens)
@@ -61,6 +61,7 @@ module PragmaticSegmenter
61
61
  MixedDoubleQQ = Rule.new(/☇/, '??')
62
62
  MixedDoubleEQ = Rule.new(/☈/, '!?')
63
63
  MixedDoubleEE = Rule.new(/☄/, '!!')
64
+ LeftParens = Rule.new(/&✂&/, '(')
64
65
  TemporaryEndingPunctutation = Rule.new('ȸ', '')
65
66
  Newline = Rule.new(/ȹ/, "\n")
66
67
 
@@ -70,7 +71,8 @@ module PragmaticSegmenter
70
71
  ExclamationPoint, QuestionMark,
71
72
  FullWidthQuestionMark, MixedDoubleQE,
72
73
  MixedDoubleQQ, MixedDoubleEQ,
73
- MixedDoubleEE, TemporaryEndingPunctutation,
74
+ MixedDoubleEE, LeftParens,
75
+ TemporaryEndingPunctutation,
74
76
  Newline ]
75
77
  end
76
78
 
@@ -3,7 +3,7 @@
3
3
  module PragmaticSegmenter
4
4
  # This class splits text at sentence boundary punctuation marks
5
5
  class SentenceBoundaryPunctuation
6
- SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)])*\)(?=\s[A-Z])|'(?:[^'])*'(?=\s[A-Z])|"(?:[^"])*"(?=\s[A-Z])|“(?:[^”])*”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
6
+ SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*'(?=\s[A-Z])|"(?:[^"])*"(?=\s[A-Z])|“(?:[^”])*”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
7
7
 
8
8
  attr_reader :text
9
9
  def initialize(text:)
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.0.8"
2
+ VERSION = "0.0.9"
3
3
  end
@@ -900,7 +900,7 @@ RSpec.describe PragmaticSegmenter::Segmenter do
900
900
  end
901
901
 
902
902
  it "correctly segments text #086" do
903
- ps = PragmaticSegmenter::Segmenter.new(text: "(a) Hello world. \n(b) Hello world.\n(c) Hello world.\n(d) Hello world.\n(e) Hello world.\n(f) Hello world.", language: "en")
903
+ ps = PragmaticSegmenter::Segmenter.new(text: "(a) Hello world. (b) Hello world. (c) Hello world. (d) Hello world. (e) Hello world.\n(f) Hello world.", language: "en")
904
904
  expect(ps.segment).to eq(["(a) Hello world.", "(b) Hello world.", "(c) Hello world.", "(d) Hello world.", "(e) Hello world.", "(f) Hello world."])
905
905
  end
906
906
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias