pragmatic_segmenter 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -1
- data/lib/pragmatic_segmenter/list.rb +7 -7
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +5 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3660e1361fcdab9e88c4a53530ee3c6b63afce92
|
4
|
+
data.tar.gz: baf77669ff3b7ee9d5c5172206ed79c6ec966948
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 901a24ad1a48ebca4e2daf88bc593e8b82e9fd1f8c7eee28ff81de4c32757571dc549e439d9fd6bba557846f3428e57a8290acba6d524516b237830cea9cdc7c
|
7
|
+
data.tar.gz: 9a83f91f6e3a6d22d97498173c486b7e0a608563f3ed77091c409510242e7252db1fed6a448bd29089fdb5da41c0dd0f21660c6a97c35e87862928775d9f7abe
|
data/README.md
CHANGED
@@ -761,7 +761,10 @@ To test the relative performance of different segmentation tools and libraries I
|
|
761
761
|
* Fix missing abbreviations
|
762
762
|
|
763
763
|
**Version 0.1.5**
|
764
|
-
* Fix comma at end of
|
764
|
+
* Fix comma at end of quotation bug
|
765
|
+
|
766
|
+
**Version 0.1.6**
|
767
|
+
* Fix bug in numbered list finder (ignore longer digits)
|
765
768
|
|
766
769
|
## Contributing
|
767
770
|
|
@@ -16,19 +16,19 @@ module PragmaticSegmenter
|
|
16
16
|
ListMarkerRule = Rule.new(/☝/, '')
|
17
17
|
|
18
18
|
# Rubular: http://rubular.com/r/Wv4qLdoPx7
|
19
|
-
SpaceBetweenListItemsFirstRule = Rule.new(/(?<=\S\S|^)\s(?=\S\s*\d
|
19
|
+
SpaceBetweenListItemsFirstRule = Rule.new(/(?<=\S\S|^)\s(?=\S\s*\d{1,2}♨)/, "\r")
|
20
20
|
|
21
21
|
# Rubular: http://rubular.com/r/AizHXC6HxK
|
22
|
-
SpaceBetweenListItemsSecondRule = Rule.new(/(?<=\S\S|^)\s(?=\d
|
22
|
+
SpaceBetweenListItemsSecondRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}♨)/, "\r")
|
23
23
|
|
24
24
|
# Rubular: http://rubular.com/r/GE5q6yID2j
|
25
|
-
SpaceBetweenListItemsThirdRule = Rule.new(/(?<=\S\S|^)\s(?=\d
|
25
|
+
SpaceBetweenListItemsThirdRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}☝)/, "\r")
|
26
26
|
|
27
27
|
NUMBERED_LIST_REGEX_1 =
|
28
|
-
/\s\d
|
28
|
+
/\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)|(?<=s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))/
|
29
29
|
NUMBERED_LIST_REGEX_2 =
|
30
|
-
/(?<=\s)\d
|
31
|
-
NUMBERED_LIST_PARENS_REGEX = /\d
|
30
|
+
/(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))/
|
31
|
+
NUMBERED_LIST_PARENS_REGEX = /\d{1,2}(?=\)\s)/
|
32
32
|
|
33
33
|
# Rubular: http://rubular.com/r/NsNFSqrNvJ
|
34
34
|
EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX =
|
@@ -81,7 +81,7 @@ module PragmaticSegmenter
|
|
81
81
|
def add_line_breaks_for_numbered_list_with_periods(txt)
|
82
82
|
return txt unless txt.include?('♨') &&
|
83
83
|
txt !~ /♨.+\n.+♨|♨.+\r.+♨/ &&
|
84
|
-
txt !~ /for\s\d
|
84
|
+
txt !~ /for\s\d{1,2}♨\s[a-z]/
|
85
85
|
txt.apply(SpaceBetweenListItemsFirstRule).
|
86
86
|
apply(SpaceBetweenListItemsSecondRule)
|
87
87
|
end
|
@@ -933,6 +933,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
933
933
|
ps = PragmaticSegmenter::Segmenter.new(text: "\"It's a good thing that the water is really calm,\" I answered ironically.")
|
934
934
|
expect(ps.segment).to eq(["\"It's a good thing that the water is really calm,\" I answered ironically."])
|
935
935
|
end
|
936
|
+
|
937
|
+
it "correctly segments text #093" do
|
938
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "December 31, 1988. Hello world. It's great! \nBorn April 05, 1989.")
|
939
|
+
expect(ps.segment).to eq(["December 31, 1988.", "Hello world.", "It's great!", "Born April 05, 1989."])
|
940
|
+
end
|
936
941
|
end
|
937
942
|
end
|
938
943
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-01-
|
11
|
+
date: 2015-01-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|