pragmatic_segmenter 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +11 -0
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +5 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a6efcebc92c7cc21017d9acacea4283de1003094
|
4
|
+
data.tar.gz: 0c64c57f9c5e506848295199e9c8195cce89a8d3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: edc26a49545a32ae220caac43979861ebe2f28fb892d857d5c920bb857df38a62bfe161ed268168710b155510c780edf1d625e3f61e7e7bdaeaa54c52a1f87aa
|
7
|
+
data.tar.gz: 970301bbcb8e984e1c9d3c8ed222831d35fa7a0731af9b9916ca593d746dbad9db2922d20af5690be810de1100df021ef799c93c2e03901381681fb99980ee8a
|
data/README.md
CHANGED
@@ -747,6 +747,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
747
747
|
**Version 0.1.0**
|
748
748
|
* Add Kommanditgesellschaft Rule
|
749
749
|
|
750
|
+
**Version 0.1.1**
|
751
|
+
* Fix handling of German dates
|
752
|
+
|
750
753
|
## Contributing
|
751
754
|
|
752
755
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
data/lib/pragmatic_segmenter/languages/deutsch.rb
CHANGED
@@ -32,9 +32,20 @@ module PragmaticSegmenter
|
|
32
32
|
# Rubular: http://rubular.com/r/ityNMwdghj
# Matches a period that directly follows a hyphen plus one digit, or a hyphen
# plus a two-digit number whose first digit is non-zero, and that is followed
# by whitespace. NOTE(review): '∯' is presumably the replacement text handed
# to Rule (defined elsewhere) - confirm against the Rule class.
|
33
33
|
NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
|
34
34
|
|
35
|
+
# German month names used to recognise dates such as "01. Januar 2015"; frozen so the shared list cannot be mutated at runtime.
DE_MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember'].freeze
|
36
|
+
|
35
37
|
# Runs the parent implementation via super, applies NumberPeriodSpaceRule and
# NegativeNumberPeriodSpaceRule to @text, then passes @text to
# replace_period_in_deutsch_dates. The result of that last call is this
# method's return value.
def replace
|
36
38
|
super
|
37
39
|
@text.apply(NumberPeriodSpaceRule).apply(NegativeNumberPeriodSpaceRule)
|
40
|
+
replace_period_in_deutsch_dates(@text)
|
41
|
+
end
|
42
|
+
|
43
|
+
# For every month name in DE_MONTHS, replaces (in place, via gsub!) each period
# that directly follows a digit and is followed by optional whitespace and that
# month name with '∯', e.g. the period in "01. Januar".
# Mutates txt and returns that same string.
# NOTE(review): '∯' is presumably a placeholder that is turned back into a
# period later - that step is not visible here.
def replace_period_in_deutsch_dates(txt)
|
44
|
+
DE_MONTHS.each do |month|
|
45
|
+
# Rubular: http://rubular.com/r/zlqgj7G5dA
|
46
|
+
txt.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯')
|
47
|
+
end
|
48
|
+
txt
|
38
49
|
end
|
39
50
|
end
|
40
51
|
|
data/spec/pragmatic_segmenter_spec.rb
CHANGED
@@ -1508,6 +1508,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
1508
1508
|
ps = PragmaticSegmenter::Segmenter.new(text: "s. vorherige Anmerkung.", language: 'de')
|
1509
1509
|
expect(ps.segment).to eq(["s. vorherige Anmerkung."])
|
1510
1510
|
end
|
1511
|
+
|
1512
|
+
# German date example: the period after the day number in "01. Januar 2015"
# must not end the sentence, so the whole text is expected back as one segment.
it 'correctly segments text #033' do
|
1513
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Mit Inkrafttreten des Mindestlohngesetzes (MiLoG) zum 01. Januar 2015 werden in Bezug auf den Einsatz von Leistungs.", language: 'de')
|
1514
|
+
expect(ps.segment).to eq(["Mit Inkrafttreten des Mindestlohngesetzes (MiLoG) zum 01. Januar 2015 werden in Bezug auf den Einsatz von Leistungs."])
|
1515
|
+
end
|
1511
1516
|
end
|
1512
1517
|
end
|
1513
1518
|
|