pragmatic_segmenter 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +11 -0
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +5 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a6efcebc92c7cc21017d9acacea4283de1003094
|
4
|
+
data.tar.gz: 0c64c57f9c5e506848295199e9c8195cce89a8d3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: edc26a49545a32ae220caac43979861ebe2f28fb892d857d5c920bb857df38a62bfe161ed268168710b155510c780edf1d625e3f61e7e7bdaeaa54c52a1f87aa
|
7
|
+
data.tar.gz: 970301bbcb8e984e1c9d3c8ed222831d35fa7a0731af9b9916ca593d746dbad9db2922d20af5690be810de1100df021ef799c93c2e03901381681fb99980ee8a
|
data/README.md
CHANGED
@@ -747,6 +747,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
747
747
|
**Version 0.1.0**
|
748
748
|
* Add Kommanditgesellschaft Rule
|
749
749
|
|
750
|
+
**Version 0.1.1**
|
751
|
+
* Fix handling of German dates
|
752
|
+
|
750
753
|
## Contributing
|
751
754
|
|
752
755
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
data/lib/pragmatic_segmenter/languages/deutsch.rb
CHANGED
@@ -32,9 +32,20 @@ module PragmaticSegmenter
|
|
32
32
|
# Rubular: http://rubular.com/r/ityNMwdghj
|
33
33
|
NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
|
34
34
|
|
35
|
+
DE_MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember']
|
36
|
+
|
35
37
|
def replace
|
36
38
|
super
|
37
39
|
@text.apply(NumberPeriodSpaceRule).apply(NegativeNumberPeriodSpaceRule)
|
40
|
+
replace_period_in_deutsch_dates(@text)
|
41
|
+
end
|
42
|
+
|
43
|
+
def replace_period_in_deutsch_dates(txt)
|
44
|
+
DE_MONTHS.each do |month|
|
45
|
+
# Rubular: http://rubular.com/r/zlqgj7G5dA
|
46
|
+
txt.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯')
|
47
|
+
end
|
48
|
+
txt
|
38
49
|
end
|
39
50
|
end
|
40
51
|
|
data/spec/pragmatic_segmenter_spec.rb
CHANGED
@@ -1508,6 +1508,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
1508
1508
|
ps = PragmaticSegmenter::Segmenter.new(text: "s. vorherige Anmerkung.", language: 'de')
|
1509
1509
|
expect(ps.segment).to eq(["s. vorherige Anmerkung."])
|
1510
1510
|
end
|
1511
|
+
|
1512
|
+
it 'correctly segments text #033' do
|
1513
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Mit Inkrafttreten des Mindestlohngesetzes (MiLoG) zum 01. Januar 2015 werden in Bezug auf den Einsatz von Leistungs.", language: 'de')
|
1514
|
+
expect(ps.segment).to eq(["Mit Inkrafttreten des Mindestlohngesetzes (MiLoG) zum 01. Januar 2015 werden in Bezug auf den Einsatz von Leistungs."])
|
1515
|
+
end
|
1511
1516
|
end
|
1512
1517
|
end
|
1513
1518
|
|