pragmatic_segmenter 0.1.5 → 0.1.6

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a2951fa2242e1eb7ce0898862d41fd5239d871f8
4
- data.tar.gz: 01d84b2637e84598907ef08e59ec64e90855c2ec
3
+ metadata.gz: 3660e1361fcdab9e88c4a53530ee3c6b63afce92
4
+ data.tar.gz: baf77669ff3b7ee9d5c5172206ed79c6ec966948
5
5
  SHA512:
6
- metadata.gz: 25ec02bc41c649a57b1bc993848bed546514568c74e2e0ec5b6a2242fbbb579e61e199210f8ea4349ceadee5a06ad0fa76c7c60196762903cab984bbad3c2286
7
- data.tar.gz: 925a24d8c131813ddb4f80a8865697baa0efe05ef781cabfc13dfb9b1efc878c2e6e276a46a9f1301049a99b2f64a558b6a9911e0a2f5f8f4c17622df3fd0b2a
6
+ metadata.gz: 901a24ad1a48ebca4e2daf88bc593e8b82e9fd1f8c7eee28ff81de4c32757571dc549e439d9fd6bba557846f3428e57a8290acba6d524516b237830cea9cdc7c
7
+ data.tar.gz: 9a83f91f6e3a6d22d97498173c486b7e0a608563f3ed77091c409510242e7252db1fed6a448bd29089fdb5da41c0dd0f21660c6a97c35e87862928775d9f7abe
data/README.md CHANGED
@@ -761,7 +761,10 @@ To test the relative performance of different segmentation tools and libraries I
761
761
  * Fix missing abbreviations
762
762
 
763
763
  **Version 0.1.5**
764
- * Fix comma at end of quoatation bug
764
+ * Fix comma at end of quotation bug
765
+
766
+ **Version 0.1.6**
767
+ * Fix bug in numbered list finder (ignore longer digits)
765
768
 
766
769
  ## Contributing
767
770
 
@@ -16,19 +16,19 @@ module PragmaticSegmenter
16
16
  ListMarkerRule = Rule.new(/☝/, '')
17
17
 
18
18
  # Rubular: http://rubular.com/r/Wv4qLdoPx7
19
- SpaceBetweenListItemsFirstRule = Rule.new(/(?<=\S\S|^)\s(?=\S\s*\d+♨)/, "\r")
19
+ SpaceBetweenListItemsFirstRule = Rule.new(/(?<=\S\S|^)\s(?=\S\s*\d{1,2}♨)/, "\r")
20
20
 
21
21
  # Rubular: http://rubular.com/r/AizHXC6HxK
22
- SpaceBetweenListItemsSecondRule = Rule.new(/(?<=\S\S|^)\s(?=\d+♨)/, "\r")
22
+ SpaceBetweenListItemsSecondRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}♨)/, "\r")
23
23
 
24
24
  # Rubular: http://rubular.com/r/GE5q6yID2j
25
- SpaceBetweenListItemsThirdRule = Rule.new(/(?<=\S\S|^)\s(?=\d+☝)/, "\r")
25
+ SpaceBetweenListItemsThirdRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}☝)/, "\r")
26
26
 
27
27
  NUMBERED_LIST_REGEX_1 =
28
- /\s\d+(?=\.\s)|^\d+(?=\.\s)|\s\d+(?=\.\))|^\d+(?=\.\))|(?<=\s\-)\d+(?=\.\s)|(?<=^\-)\d+(?=\.\s)|(?<=\s\⁃)\d+(?=\.\s)|(?<=^\⁃)\d+(?=\.\s)|(?<=s\-)\d+(?=\.\))|(?<=^\-)\d+(?=\.\))|(?<=\s\⁃)\d+(?=\.\))|(?<=^\⁃)\d+(?=\.\))/
28
+ /\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)|(?<=s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))/
29
29
  NUMBERED_LIST_REGEX_2 =
30
- /(?<=\s)\d+\.(?=\s)|^\d+\.(?=\s)|(?<=\s)\d+\.(?=\))|^\d+\.(?=\))|(?<=\s\-)\d+\.(?=\s)|(?<=^\-)\d+\.(?=\s)|(?<=\s\⁃)\d+\.(?=\s)|(?<=^\⁃)\d+\.(?=\s)|(?<=\s\-)\d+\.(?=\))|(?<=^\-)\d+\.(?=\))|(?<=\s\⁃)\d+\.(?=\))|(?<=^\⁃)\d+\.(?=\))/
31
- NUMBERED_LIST_PARENS_REGEX = /\d+(?=\)\s)/
30
+ /(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))/
31
+ NUMBERED_LIST_PARENS_REGEX = /\d{1,2}(?=\)\s)/
32
32
 
33
33
  # Rubular: http://rubular.com/r/NsNFSqrNvJ
34
34
  EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX =
@@ -81,7 +81,7 @@ module PragmaticSegmenter
81
81
  def add_line_breaks_for_numbered_list_with_periods(txt)
82
82
  return txt unless txt.include?('♨') &&
83
83
  txt !~ /♨.+\n.+♨|♨.+\r.+♨/ &&
84
- txt !~ /for\s\d+♨\s[a-z]/
84
+ txt !~ /for\s\d{1,2}♨\s[a-z]/
85
85
  txt.apply(SpaceBetweenListItemsFirstRule).
86
86
  apply(SpaceBetweenListItemsSecondRule)
87
87
  end
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.6"
3
3
  end
@@ -933,6 +933,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
933
933
  ps = PragmaticSegmenter::Segmenter.new(text: "\"It's a good thing that the water is really calm,\" I answered ironically.")
934
934
  expect(ps.segment).to eq(["\"It's a good thing that the water is really calm,\" I answered ironically."])
935
935
  end
936
+
937
+ it "correctly segments text #093" do
938
+ ps = PragmaticSegmenter::Segmenter.new(text: "December 31, 1988. Hello world. It's great! \nBorn April 05, 1989.")
939
+ expect(ps.segment).to eq(["December 31, 1988.", "Hello world.", "It's great!", "Born April 05, 1989."])
940
+ end
936
941
  end
937
942
  end
938
943
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-13 00:00:00.000000000 Z
11
+ date: 2015-01-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler