pragmatic_segmenter 0.0.9 → 0.1.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c54f1c34c1f0bd34858cbd9c3915ec47f8a4d807
4
- data.tar.gz: 19a936bfd8fb6c0046f06a20f50319fb5a2c0ed7
3
+ metadata.gz: 96aa232b769e566aad1de6ace0fa6d8591e3859c
4
+ data.tar.gz: 2ed0518fb7c236821b0986795e7bc1ce5e19e1a8
5
5
  SHA512:
6
- metadata.gz: 824392373d56549289ae89de98976d53afdf74ccde45f3b6069164d8882fbeef7ae04e2fec1261af2baf43223246b4f1f90ebab12b97fa0c22e0d81d6caa56fa
7
- data.tar.gz: 4f43e20c69be0515d81c9d98e33b7ed2aadf05d5110ddd7acae3f971745c8406c03763fdc275afa3b14de3d5aa8846a85638845d90cc4c9fd34f4cf9e120d1d3
6
+ metadata.gz: fef9536b2c2dc9896aedcb363f89bc10af37c09d66c034d8814ccb784619a9ea82fa3d20e9e69316dfcef06d7013183d9dcbd564430dc7ce2303a7cfbe32ba87
7
+ data.tar.gz: 813a64e15c83b9dab5a2fe29c0bebba638fba0663c3debb26442cd42f060fa87e927846309355ec15953ca0a45c90d764fae796533f7d054282bdf8b5bcb06d4
data/README.md CHANGED
@@ -744,6 +744,9 @@ To test the relative performance of different segmentation tools and libraries I
744
744
  **Version 0.0.9**
745
745
  * Improve handling of alphabetical and roman numeral lists
746
746
 
747
+ **Version 0.1.0**
748
+ * Add Kommanditgesellschaft Rule
749
+
747
750
  ## Contributing
748
751
 
749
752
  If you find a text that is incorrectly segmented using this gem, please submit an issue.
@@ -9,6 +9,9 @@ module PragmaticSegmenter
9
9
  # Rubular: http://rubular.com/r/yqa4Rit8EY
10
10
  PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
11
11
 
12
+ # Rubular: http://rubular.com/r/NEv265G2X2
13
+ KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '∯')
14
+
12
15
  # Rubular: http://rubular.com/r/xDkpFZ0EgH
13
16
  MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
14
17
 
@@ -37,6 +40,7 @@ module PragmaticSegmenter
37
40
 
38
41
  def replace
39
42
  @reformatted_text = text.apply(PossessiveAbbreviationRule)
43
+ @reformatted_text = text.apply(KommanditgesellschaftRule)
40
44
  @reformatted_text = PragmaticSegmenter::SingleLetterAbbreviation.new(text: @reformatted_text).replace
41
45
  @reformatted_text = search_for_abbreviations_in_string(@reformatted_text, abbreviations)
42
46
  @reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.0.9"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -913,6 +913,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
913
913
  ps = PragmaticSegmenter::Segmenter.new(text: "A) Hello world. \nB) Hello world.\nC) Hello world.\nD) Hello world.\nE) Hello world.\nF) Hello world.", language: "en")
914
914
  expect(ps.segment).to eq(["A) Hello world.", "B) Hello world.", "C) Hello world.", "D) Hello world.", "E) Hello world.", "F) Hello world."])
915
915
  end
916
+
917
+ it "correctly segments text #089" do
918
+ ps = PragmaticSegmenter::Segmenter.new(text: "The GmbH & Co. KG is a limited partnership with, typically, the sole general partner being a limited liability company.")
919
+ expect(ps.segment).to eq(["The GmbH & Co. KG is a limited partnership with, typically, the sole general partner being a limited liability company."])
920
+ end
916
921
  end
917
922
  end
918
923
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias