pragmatic_segmenter 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a2951fa2242e1eb7ce0898862d41fd5239d871f8
4
- data.tar.gz: 01d84b2637e84598907ef08e59ec64e90855c2ec
3
+ metadata.gz: 3660e1361fcdab9e88c4a53530ee3c6b63afce92
4
+ data.tar.gz: baf77669ff3b7ee9d5c5172206ed79c6ec966948
5
5
  SHA512:
6
- metadata.gz: 25ec02bc41c649a57b1bc993848bed546514568c74e2e0ec5b6a2242fbbb579e61e199210f8ea4349ceadee5a06ad0fa76c7c60196762903cab984bbad3c2286
7
- data.tar.gz: 925a24d8c131813ddb4f80a8865697baa0efe05ef781cabfc13dfb9b1efc878c2e6e276a46a9f1301049a99b2f64a558b6a9911e0a2f5f8f4c17622df3fd0b2a
6
+ metadata.gz: 901a24ad1a48ebca4e2daf88bc593e8b82e9fd1f8c7eee28ff81de4c32757571dc549e439d9fd6bba557846f3428e57a8290acba6d524516b237830cea9cdc7c
7
+ data.tar.gz: 9a83f91f6e3a6d22d97498173c486b7e0a608563f3ed77091c409510242e7252db1fed6a448bd29089fdb5da41c0dd0f21660c6a97c35e87862928775d9f7abe
data/README.md CHANGED
@@ -761,7 +761,10 @@ To test the relative performance of different segmentation tools and libraries I
761
761
  * Fix missing abbreviations
762
762
 
763
763
  **Version 0.1.5**
764
- * Fix comma at end of quoatation bug
764
+ * Fix comma at end of quotation bug
765
+
766
+ **Version 0.1.6**
767
+ * Fix bug in numbered list finder (ignore numbers with more than two digits)
765
768
 
766
769
  ## Contributing
767
770
 
@@ -16,19 +16,19 @@ module PragmaticSegmenter
16
16
  ListMarkerRule = Rule.new(/☝/, '')
17
17
 
18
18
  # Rubular: http://rubular.com/r/Wv4qLdoPx7
19
- SpaceBetweenListItemsFirstRule = Rule.new(/(?<=\S\S|^)\s(?=\S\s*\d+♨)/, "\r")
19
+ SpaceBetweenListItemsFirstRule = Rule.new(/(?<=\S\S|^)\s(?=\S\s*\d{1,2}♨)/, "\r")
20
20
 
21
21
  # Rubular: http://rubular.com/r/AizHXC6HxK
22
- SpaceBetweenListItemsSecondRule = Rule.new(/(?<=\S\S|^)\s(?=\d+♨)/, "\r")
22
+ SpaceBetweenListItemsSecondRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}♨)/, "\r")
23
23
 
24
24
  # Rubular: http://rubular.com/r/GE5q6yID2j
25
- SpaceBetweenListItemsThirdRule = Rule.new(/(?<=\S\S|^)\s(?=\d+☝)/, "\r")
25
+ SpaceBetweenListItemsThirdRule = Rule.new(/(?<=\S\S|^)\s(?=\d{1,2}☝)/, "\r")
26
26
 
27
27
  NUMBERED_LIST_REGEX_1 =
28
- /\s\d+(?=\.\s)|^\d+(?=\.\s)|\s\d+(?=\.\))|^\d+(?=\.\))|(?<=\s\-)\d+(?=\.\s)|(?<=^\-)\d+(?=\.\s)|(?<=\s\⁃)\d+(?=\.\s)|(?<=^\⁃)\d+(?=\.\s)|(?<=s\-)\d+(?=\.\))|(?<=^\-)\d+(?=\.\))|(?<=\s\⁃)\d+(?=\.\))|(?<=^\⁃)\d+(?=\.\))/
28
+ /\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)|(?<=s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))/
29
29
  NUMBERED_LIST_REGEX_2 =
30
- /(?<=\s)\d+\.(?=\s)|^\d+\.(?=\s)|(?<=\s)\d+\.(?=\))|^\d+\.(?=\))|(?<=\s\-)\d+\.(?=\s)|(?<=^\-)\d+\.(?=\s)|(?<=\s\⁃)\d+\.(?=\s)|(?<=^\⁃)\d+\.(?=\s)|(?<=\s\-)\d+\.(?=\))|(?<=^\-)\d+\.(?=\))|(?<=\s\⁃)\d+\.(?=\))|(?<=^\⁃)\d+\.(?=\))/
31
- NUMBERED_LIST_PARENS_REGEX = /\d+(?=\)\s)/
30
+ /(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))/
31
+ NUMBERED_LIST_PARENS_REGEX = /\d{1,2}(?=\)\s)/
32
32
 
33
33
  # Rubular: http://rubular.com/r/NsNFSqrNvJ
34
34
  EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX =
@@ -81,7 +81,7 @@ module PragmaticSegmenter
81
81
  def add_line_breaks_for_numbered_list_with_periods(txt)
82
82
  return txt unless txt.include?('♨') &&
83
83
  txt !~ /♨.+\n.+♨|♨.+\r.+♨/ &&
84
- txt !~ /for\s\d+♨\s[a-z]/
84
+ txt !~ /for\s\d{1,2}♨\s[a-z]/
85
85
  txt.apply(SpaceBetweenListItemsFirstRule).
86
86
  apply(SpaceBetweenListItemsSecondRule)
87
87
  end
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.6"
3
3
  end
@@ -933,6 +933,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
933
933
  ps = PragmaticSegmenter::Segmenter.new(text: "\"It's a good thing that the water is really calm,\" I answered ironically.")
934
934
  expect(ps.segment).to eq(["\"It's a good thing that the water is really calm,\" I answered ironically."])
935
935
  end
936
+
937
+ it "correctly segments text #093" do
938
+ ps = PragmaticSegmenter::Segmenter.new(text: "December 31, 1988. Hello world. It's great! \nBorn April 05, 1989.")
939
+ expect(ps.segment).to eq(["December 31, 1988.", "Hello world.", "It's great!", "Born April 05, 1989."])
940
+ end
936
941
  end
937
942
  end
938
943
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-13 00:00:00.000000000 Z
11
+ date: 2015-01-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler