pragmatic_segmenter 0.0.8 → 0.0.9

This diff shows the changes between publicly available versions of the package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 80eb7dfc7aeed8a66ff324dde7a87ea544e55e45
4
- data.tar.gz: c1f3daf78133d748d6cf02133937dad1e05ef0ee
3
+ metadata.gz: c54f1c34c1f0bd34858cbd9c3915ec47f8a4d807
4
+ data.tar.gz: 19a936bfd8fb6c0046f06a20f50319fb5a2c0ed7
5
5
  SHA512:
6
- metadata.gz: f2c6018bb4d46ccc5ef86bbc548437861fa83cf4948abd09e45d8c4b23c021c48907feb9cd65916a9ef8635ede477f5f0b8e221cb7452b38c236f79a3d0cfa77
7
- data.tar.gz: d63e6eb39e52306785e491f1bfd1d4196e992e513a857b8893ef9c24fcab95f515dc863789f5040609436dc116efbac608a7ac010c0015fa159535198b1554ea
6
+ metadata.gz: 824392373d56549289ae89de98976d53afdf74ccde45f3b6069164d8882fbeef7ae04e2fec1261af2baf43223246b4f1f90ebab12b97fa0c22e0d81d6caa56fa
7
+ data.tar.gz: 4f43e20c69be0515d81c9d98e33b7ed2aadf05d5110ddd7acae3f971745c8406c03763fdc275afa3b14de3d5aa8846a85638845d90cc4c9fd34f4cf9e120d1d3
data/README.md CHANGED
@@ -741,6 +741,9 @@ To test the relative performance of different segmentation tools and libraries I
741
741
  **Version 0.0.8**
742
742
  * Fix error in `list.rb`
743
743
 
744
+ **Version 0.0.9**
745
+ * Improve handling of alphabetical and roman numeral lists
746
+
744
747
  ## Contributing
745
748
 
746
749
  If you find a text that is incorrectly segmented using this gem, please submit an issue.
@@ -32,7 +32,7 @@ module PragmaticSegmenter
32
32
 
33
33
  # Rubular: http://rubular.com/r/NsNFSqrNvJ
34
34
  EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX =
35
- /(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i
35
+ /\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i
36
36
 
37
37
  # Rubular: http://rubular.com/r/wMpnVedEIb
38
38
  ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX =
@@ -134,10 +134,10 @@ module PragmaticSegmenter
134
134
 
135
135
  def replace_alphabet_list_parens(a, txt)
136
136
  txt.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
137
- if txt =~ /\(#{Regexp.escape(m.to_s)}\)/i
138
- a.eql?(m.dup.downcase) ? "\rȸ(#{Regexp.escape(m.to_s)}" : "#{m}"
137
+ if m.include?('(')
138
+ a.eql?(m.dup.downcase.gsub!(/\(/, '')) ? "\r&✂&#{Regexp.escape(m.gsub!(/\(/, ''))}" : "#{m}"
139
139
  else
140
- a.eql?(m.dup.downcase) ? "\r#{Regexp.escape(m.to_s)}" : "#{m}"
140
+ a.eql?(m.dup.downcase) ? "\r#{Regexp.escape(m)}" : "#{m}"
141
141
  end
142
142
  end
143
143
  end
@@ -175,6 +175,7 @@ module PragmaticSegmenter
175
175
  else
176
176
  alphabet = ('a'..'z').to_a
177
177
  end
178
+ list_array.delete_if { |item| !alphabet.any? { |a| a.include?(item) } }
178
179
  list_array.each_with_index do |a, i|
179
180
  if i.eql?(list_array.length - 1)
180
181
  last_array_item_replacement(a, i, alphabet, list_array, txt, parens)
@@ -61,6 +61,7 @@ module PragmaticSegmenter
61
61
  MixedDoubleQQ = Rule.new(/☇/, '??')
62
62
  MixedDoubleEQ = Rule.new(/☈/, '!?')
63
63
  MixedDoubleEE = Rule.new(/☄/, '!!')
64
+ LeftParens = Rule.new(/&✂&/, '(')
64
65
  TemporaryEndingPunctutation = Rule.new('ȸ', '')
65
66
  Newline = Rule.new(/ȹ/, "\n")
66
67
 
@@ -70,7 +71,8 @@ module PragmaticSegmenter
70
71
  ExclamationPoint, QuestionMark,
71
72
  FullWidthQuestionMark, MixedDoubleQE,
72
73
  MixedDoubleQQ, MixedDoubleEQ,
73
- MixedDoubleEE, TemporaryEndingPunctutation,
74
+ MixedDoubleEE, LeftParens,
75
+ TemporaryEndingPunctutation,
74
76
  Newline ]
75
77
  end
76
78
 
@@ -3,7 +3,7 @@
3
3
  module PragmaticSegmenter
4
4
  # This class splits text at sentence boundary punctuation marks
5
5
  class SentenceBoundaryPunctuation
6
- SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)])*\)(?=\s[A-Z])|'(?:[^'])*'(?=\s[A-Z])|"(?:[^"])*"(?=\s[A-Z])|“(?:[^”])*”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
6
+ SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*'(?=\s[A-Z])|"(?:[^"])*"(?=\s[A-Z])|“(?:[^”])*”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
7
7
 
8
8
  attr_reader :text
9
9
  def initialize(text:)
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.0.8"
2
+ VERSION = "0.0.9"
3
3
  end
@@ -900,7 +900,7 @@ RSpec.describe PragmaticSegmenter::Segmenter do
900
900
  end
901
901
 
902
902
  it "correctly segments text #086" do
903
- ps = PragmaticSegmenter::Segmenter.new(text: "(a) Hello world. \n(b) Hello world.\n(c) Hello world.\n(d) Hello world.\n(e) Hello world.\n(f) Hello world.", language: "en")
903
+ ps = PragmaticSegmenter::Segmenter.new(text: "(a) Hello world. (b) Hello world. (c) Hello world. (d) Hello world. (e) Hello world.\n(f) Hello world.", language: "en")
904
904
  expect(ps.segment).to eq(["(a) Hello world.", "(b) Hello world.", "(c) Hello world.", "(d) Hello world.", "(e) Hello world.", "(f) Hello world."])
905
905
  end
906
906
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias