pragmatic_segmenter 0.0.9 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 96aa232b769e566aad1de6ace0fa6d8591e3859c
|
4
|
+
data.tar.gz: 2ed0518fb7c236821b0986795e7bc1ce5e19e1a8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fef9536b2c2dc9896aedcb363f89bc10af37c09d66c034d8814ccb784619a9ea82fa3d20e9e69316dfcef06d7013183d9dcbd564430dc7ce2303a7cfbe32ba87
|
7
|
+
data.tar.gz: 813a64e15c83b9dab5a2fe29c0bebba638fba0663c3debb26442cd42f060fa87e927846309355ec15953ca0a45c90d764fae796533f7d054282bdf8b5bcb06d4
|
data/README.md
CHANGED
@@ -744,6 +744,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
744
744
|
**Version 0.0.9**
|
745
745
|
* Improve handling of alphabetical and roman numeral lists
|
746
746
|
|
747
|
+
**Version 0.1.0**
|
748
|
+
* Add Kommanditgesellschaft rule (the period in `Co. KG` is no longer treated as a sentence boundary)
|
749
|
+
|
747
750
|
## Contributing
|
748
751
|
|
749
752
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -9,6 +9,9 @@ module PragmaticSegmenter
|
|
9
9
|
# Rubular: http://rubular.com/r/yqa4Rit8EY
|
10
10
|
PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
|
11
11
|
|
12
|
+
# Rubular: http://rubular.com/r/NEv265G2X2 (masks the period in "Co. KG" so it is not treated as a sentence end)
|
13
|
+
KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '∯')
|
14
|
+
|
12
15
|
# Rubular: http://rubular.com/r/xDkpFZ0EgH
|
13
16
|
MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
|
14
17
|
|
@@ -37,6 +40,7 @@ module PragmaticSegmenter
|
|
37
40
|
|
38
41
|
def replace
|
39
42
|
@reformatted_text = text.apply(PossessiveAbbreviationRule)
|
43
|
+
@reformatted_text = text.apply(KommanditgesellschaftRule)
|
40
44
|
@reformatted_text = PragmaticSegmenter::SingleLetterAbbreviation.new(text: @reformatted_text).replace
|
41
45
|
@reformatted_text = search_for_abbreviations_in_string(@reformatted_text, abbreviations)
|
42
46
|
@reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
|
@@ -913,6 +913,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
913
913
|
ps = PragmaticSegmenter::Segmenter.new(text: "A) Hello world. \nB) Hello world.\nC) Hello world.\nD) Hello world.\nE) Hello world.\nF) Hello world.", language: "en")
|
914
914
|
expect(ps.segment).to eq(["A) Hello world.", "B) Hello world.", "C) Hello world.", "D) Hello world.", "E) Hello world.", "F) Hello world."])
|
915
915
|
end
|
916
|
+
|
917
|
+
it "correctly segments text #089" do
|
918
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "The GmbH & Co. KG is a limited partnership with, typically, the sole general partner being a limited liability company.")
|
919
|
+
expect(ps.segment).to eq(["The GmbH & Co. KG is a limited partnership with, typically, the sole general partner being a limited liability company."])
|
920
|
+
end
|
916
921
|
end
|
917
922
|
end
|
918
923
|
|