pragmatic_segmenter 0.0.9 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 96aa232b769e566aad1de6ace0fa6d8591e3859c
|
4
|
+
data.tar.gz: 2ed0518fb7c236821b0986795e7bc1ce5e19e1a8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fef9536b2c2dc9896aedcb363f89bc10af37c09d66c034d8814ccb784619a9ea82fa3d20e9e69316dfcef06d7013183d9dcbd564430dc7ce2303a7cfbe32ba87
|
7
|
+
data.tar.gz: 813a64e15c83b9dab5a2fe29c0bebba638fba0663c3debb26442cd42f060fa87e927846309355ec15953ca0a45c90d764fae796533f7d054282bdf8b5bcb06d4
|
data/README.md
CHANGED
@@ -744,6 +744,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
744
744
|
**Version 0.0.9**
|
745
745
|
* Improve handling of alphabetical and Roman numeral lists
|
746
746
|
|
747
|
+
**Version 0.1.0**
|
748
|
+
* Add Kommanditgesellschaft rule: the period in "Co. KG" no longer splits a sentence
|
749
|
+
|
747
750
|
## Contributing
|
748
751
|
|
749
752
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -9,6 +9,9 @@ module PragmaticSegmenter
|
|
9
9
|
# Rubular: http://rubular.com/r/yqa4Rit8EY
|
10
10
|
# Matches a period that is immediately followed by "'s" and then whitespace,
# an end of line, or the end of the string (e.g. the last period in "U.S.'s ").
# The pattern is paired with '∯' — presumably the replacement for the matched
# period; confirm against the Rule definition, which is not shown here.
PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
|
11
11
|
|
12
|
+
# Rubular: http://rubular.com/r/NEv265G2X2
|
13
|
+
# Matches the period in "Co. KG": a period preceded by "Co" and followed by
# exactly one whitespace character and then "KG". The pattern is paired with
# '∯' — presumably the replacement for the matched period; confirm against the
# Rule definition, which is not shown here.
KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '∯')
|
14
|
+
|
12
15
|
# Rubular: http://rubular.com/r/xDkpFZ0EgH
|
13
16
|
# Case-insensitive match for dotted single-letter sequences such as "U.S.A."
# or "e.g.": a letter at a word boundary, one or more ".<letter>" groups,
# then a final period.
MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
|
14
17
|
|
@@ -37,6 +40,7 @@ module PragmaticSegmenter
|
|
37
40
|
|
38
41
|
def replace
|
39
42
|
@reformatted_text = text.apply(PossessiveAbbreviationRule)
|
43
|
+
@reformatted_text = text.apply(KommanditgesellschaftRule)
|
40
44
|
@reformatted_text = PragmaticSegmenter::SingleLetterAbbreviation.new(text: @reformatted_text).replace
|
41
45
|
@reformatted_text = search_for_abbreviations_in_string(@reformatted_text, abbreviations)
|
42
46
|
@reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
|
@@ -913,6 +913,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
913
913
|
ps = PragmaticSegmenter::Segmenter.new(text: "A) Hello world. \nB) Hello world.\nC) Hello world.\nD) Hello world.\nE) Hello world.\nF) Hello world.", language: "en")
|
914
914
|
expect(ps.segment).to eq(["A) Hello world.", "B) Hello world.", "C) Hello world.", "D) Hello world.", "E) Hello world.", "F) Hello world."])
|
915
915
|
end
|
916
|
+
|
917
|
+
# Example for "Co. KG": the whole input must come back as a single segment,
# i.e. the period after "Co" is not treated as a sentence boundary.
it "correctly segments text #089" do
|
918
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "The GmbH & Co. KG is a limited partnership with, typically, the sole general partner being a limited liability company.")
|
919
|
+
expect(ps.segment).to eq(["The GmbH & Co. KG is a limited partnership with, typically, the sole general partner being a limited liability company."])
|
920
|
+
end
|
916
921
|
end
|
917
922
|
end
|
918
923
|
|