pragmatic_segmenter 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +4 -1
- data/lib/pragmatic_segmenter/list.rb +7 -7
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +5 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3660e1361fcdab9e88c4a53530ee3c6b63afce92
|
4
|
+
data.tar.gz: baf77669ff3b7ee9d5c5172206ed79c6ec966948
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 901a24ad1a48ebca4e2daf88bc593e8b82e9fd1f8c7eee28ff81de4c32757571dc549e439d9fd6bba557846f3428e57a8290acba6d524516b237830cea9cdc7c
|
7
|
+
data.tar.gz: 9a83f91f6e3a6d22d97498173c486b7e0a608563f3ed77091c409510242e7252db1fed6a448bd29089fdb5da41c0dd0f21660c6a97c35e87862928775d9f7abe
|
data/README.md
CHANGED
@@ -761,7 +761,10 @@ To test the relative performance of different segmentation tools and libraries I
|
|
761
761
|
* Fix missing abbreviations
|
762
762
|
|
763
763
|
**Version 0.1.5**
|
764
|
-
* Fix comma at end of
|
764
|
+
* Fix comma at end of quotation bug
|
765
|
+
|
766
|
+
**Version 0.1.6**
|
767
|
+
* Fix bug in numbered list finder (ignore longer digits)
|
765
768
|
|
766
769
|
## Contributing
|
767
770
|
|
@@ -16,19 +16,19 @@ module PragmaticSegmenter
|
|
16
16
|
ListMarkerRule = Rule.new(/☝/, '')
|
17
17
|
|
18
18
|
# Rubular: http://rubular.com/r/Wv4qLdoPx7
|
19
|
-
SpaceBetweenListItemsFirstRule = Rule.new(/(?<=\S\S|^)\s(?=\S\s*\d
|
19
|
+
SpaceBetweenListItemsFirstRule = Rule.new(/(?<=\S\S|^)\s(?=\S\s*\d{1,2}♨)/, "\r")
|
20
20
|
|
21
21
|
# Rubular: http://rubular.com/r/AizHXC6HxK
|
22
|
-
SpaceBetweenListItemsSecondRule = Rule.new(/(?<=\S\S|^)\s(?=\d
|
22
|
+
SpaceBetweenListItemsSecondRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}♨)/, "\r")
|
23
23
|
|
24
24
|
# Rubular: http://rubular.com/r/GE5q6yID2j
|
25
|
-
SpaceBetweenListItemsThirdRule = Rule.new(/(?<=\S\S|^)\s(?=\d
|
25
|
+
SpaceBetweenListItemsThirdRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}☝)/, "\r")
|
26
26
|
|
27
27
|
NUMBERED_LIST_REGEX_1 =
|
28
|
-
/\s\d
|
28
|
+
/\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)|(?<=s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))/
|
29
29
|
NUMBERED_LIST_REGEX_2 =
|
30
|
-
/(?<=\s)\d
|
31
|
-
NUMBERED_LIST_PARENS_REGEX = /\d
|
30
|
+
/(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))/
|
31
|
+
NUMBERED_LIST_PARENS_REGEX = /\d{1,2}(?=\)\s)/
|
32
32
|
|
33
33
|
# Rubular: http://rubular.com/r/NsNFSqrNvJ
|
34
34
|
EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX =
|
@@ -81,7 +81,7 @@ module PragmaticSegmenter
|
|
81
81
|
def add_line_breaks_for_numbered_list_with_periods(txt)
|
82
82
|
return txt unless txt.include?('♨') &&
|
83
83
|
txt !~ /♨.+\n.+♨|♨.+\r.+♨/ &&
|
84
|
-
txt !~ /for\s\d
|
84
|
+
txt !~ /for\s\d{1,2}♨\s[a-z]/
|
85
85
|
txt.apply(SpaceBetweenListItemsFirstRule).
|
86
86
|
apply(SpaceBetweenListItemsSecondRule)
|
87
87
|
end
|
@@ -933,6 +933,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
933
933
|
ps = PragmaticSegmenter::Segmenter.new(text: "\"It's a good thing that the water is really calm,\" I answered ironically.")
|
934
934
|
expect(ps.segment).to eq(["\"It's a good thing that the water is really calm,\" I answered ironically."])
|
935
935
|
end
|
936
|
+
|
937
|
+
it "correctly segments text #093" do
|
938
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "December 31, 1988. Hello world. It's great! \nBorn April 05, 1989.")
|
939
|
+
expect(ps.segment).to eq(["December 31, 1988.", "Hello world.", "It's great!", "Born April 05, 1989."])
|
940
|
+
end
|
936
941
|
end
|
937
942
|
end
|
938
943
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-01-
|
11
|
+
date: 2015-01-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|