pragmatic_segmenter 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1ea9b87e9654b097486aa24301ec3da33c8b3b79
4
- data.tar.gz: 0db4660da0b5ebd3a1536d2ad29ddc023e09a81d
3
+ metadata.gz: 1e6be4ac5dd27c491dbbed17b572b6503adb2707
4
+ data.tar.gz: 63d4c42adb13a83e5aa99702a7d7be0a162de0ad
5
5
  SHA512:
6
- metadata.gz: 5737824cefe9a0c378e540857b0fc7122c7e89e738f1cb05722e7103346c8683be532b2c42ab21e199624d4f21f9a681511cb383cced542bfad30287d1c8893e
7
- data.tar.gz: 0bfa44c11c70dc1af5b87b55499b6fb2ae306e541e5253335db45fbbc060316d2fa0c78477c36cf574e21ded594f2ccab9e0a6693154816217079df5d495ad16
6
+ metadata.gz: c21a055c652ffee7f819b79dbf49da0aeb2a068c16b14edbf995ba30f545824c359f61066490b8b5d15ef356df21edc890ea8cfc831fc246eeb293b038868250
7
+ data.tar.gz: 985d1121bc725a65d5b3ace00e1f03f1efa1e9e8edac9ae8657a9b4cb6c736720ed000fa825a3ab9b30fce928cd25f689c6068e3292df1c65c7f3b808fd103a2
data/README.md CHANGED
@@ -637,6 +637,20 @@ Hola Srta. Ledesma. Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre
637
637
  => ["کیا حال ہے؟", "ميرا نام ___ ەے۔", "میں حالا تاوان دےدوں؟"]
638
638
  ```
639
639
 
640
+ #### Golden Rules (Dutch)
641
+
642
+ 1.) **Sentence starting with a number**
643
+ ```
644
+ Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen. 81 procent van de schoten was raak.
645
+ => ["Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen.", "81 procent van de schoten was raak."]
646
+ ```
647
+
648
+ 2.) **Sentence starting with an ellipsis**
649
+ ```
650
+ 81 procent van de schoten was raak. ...en toen barste de hel los.
651
+ => ["81 procent van de schoten was raak.", "...en toen barste de hel los."]
652
+ ```
653
+
640
654
  ## Comparison of Segmentation Tools, Libraries and Algorithms
641
655
 
642
656
  Name | Programming Language | License | GRS (English) | GRS (Other Languages)† | Speed‡
@@ -657,11 +671,12 @@ Other tools not yet tested:
657
671
  * [FreeLing](http://nlp.lsi.upc.edu/freeling/)
658
672
  * [Alpino](http://www.let.rug.nl/vannoord/alp/Alpino/)
659
673
  * [trtok](https://github.com/jirkamarsik/trainable-tokenizer)
660
- * [segtok](https://pypi.python.org/pypi/segtok/1.1.0)
674
+ * [segtok](https://github.com/fnl/segtok)
661
675
  * [LingPipe](http://alias-i.com/lingpipe/demos/tutorial/sentences/read-me.html)
662
676
  * [Elephant](http://gmb.let.rug.nl/elephant/experiments.php)
663
677
  * [Ucto: Unicode Tokenizer](http://ilk.uvt.nl/ucto/)
664
678
  * [tokenizer](http://moin.delph-in.net/WeSearch/DocumentParsing)
679
+ * [spaCy](http://honnibal.github.io/spaCy/)
665
680
 
666
681
  ## Speed Performance Benchmarks
667
682
 
@@ -779,11 +794,17 @@ To test the relative performance of different segmentation tools and libraries I
779
794
  * Fix bug in splitting new sentence after single quotes
780
795
 
781
796
  **Version 0.2.0**
782
- * Add Dutch Golden rules and abbreviations
797
+ * Add Dutch Golden Rules and abbreviations
783
798
  * Update README with additional tools
784
799
  * Update segmentation test scores in README with results of new Golden Rule tests
785
800
  * Add Polish abbreviations
786
801
 
802
+ **Version 0.3.0**
803
+ * Add support for square brackets
804
+ * Add support for continuous exclamation points or question marks or combinations of both
805
+ * Fix Roman numeral support
806
+ * Add English abbreviations
807
+
787
808
  ## Contributing
788
809
 
789
810
  If you find a text that is incorrectly segmented using this gem, please submit an issue.
@@ -3,7 +3,7 @@
3
3
  module PragmaticSegmenter
4
4
  # Defines the abbreviations for each language (if available)
5
5
  class Abbreviation
6
- ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
6
+ ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
7
7
  PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']
8
8
  NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
9
9
 
@@ -17,6 +17,9 @@ module PragmaticSegmenter
17
17
  # Rubular: http://rubular.com/r/JbAIpKdlSq
18
18
  BETWEEN_QUOTE_SLANTED_REGEX = /“(?>[^”\\]+|\\{2}|\\.)*”/
19
19
 
20
+ # Rubular: http://rubular.com/r/WX4AvnZvlX
21
+ BETWEEN_SQUARE_BRACKETS_REGEX = /\[(?>[^\]\\]+|\\{2}|\\.)*\]/
22
+
20
23
  # Rubular: http://rubular.com/r/6tTityPflI
21
24
  BETWEEN_PARENS_REGEX = /\((?>[^\(\)\\]+|\\{2}|\\.)*\)/
22
25
 
@@ -34,6 +37,7 @@ module PragmaticSegmenter
34
37
  def sub_punctuation_between_quotes_and_parens(txt)
35
38
  sub_punctuation_between_single_quotes(txt)
36
39
  sub_punctuation_between_double_quotes(txt)
40
+ sub_punctuation_between_square_brackets(txt)
37
41
  sub_punctuation_between_parens(txt)
38
42
  sub_punctuation_between_quotes_arrow(txt)
39
43
  sub_punctuation_between_quotes_slanted(txt)
@@ -46,6 +50,13 @@ module PragmaticSegmenter
46
50
  ).replace
47
51
  end
48
52
 
53
+ def sub_punctuation_between_square_brackets(txt)
54
+ PragmaticSegmenter::PunctuationReplacer.new(
55
+ matches_array: txt.scan(BETWEEN_SQUARE_BRACKETS_REGEX),
56
+ text: txt
57
+ ).replace
58
+ end
59
+
49
60
  def sub_punctuation_between_single_quotes(txt)
50
61
  PragmaticSegmenter::PunctuationReplacer.new(
51
62
  matches_array: txt.scan(BETWEEN_SINGLE_QUOTES_REGEX),
@@ -4,7 +4,7 @@ module PragmaticSegmenter
4
4
  class Process < PragmaticSegmenter::Process
5
5
  private
6
6
 
7
- def between_punctutation(txt)
7
+ def between_punctuation(txt)
8
8
  PragmaticSegmenter::Languages::Deutsch::BetweenPunctuation.new(text: txt).replace
9
9
  end
10
10
 
@@ -4,7 +4,7 @@ module PragmaticSegmenter
4
4
  class Process < PragmaticSegmenter::Process
5
5
  private
6
6
 
7
- def between_punctutation(txt)
7
+ def between_punctuation(txt)
8
8
  PragmaticSegmenter::Languages::Japanese::BetweenPunctuation.new(text: txt).replace
9
9
  end
10
10
  end
@@ -4,6 +4,7 @@ module PragmaticSegmenter
4
4
  # This class searches for a list within a string and adds
5
5
  # newlines before each list item.
6
6
  class List
7
+ ROMAN_NUMERALS = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv x xi xii xiii xv xvi xvii xviii xix xx)
7
8
  # Rubular: http://rubular.com/r/XcpaJKH0sz
8
9
  ALPHABETICAL_LIST_WITH_PERIODS =
9
10
  /(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/
@@ -50,6 +51,16 @@ module PragmaticSegmenter
50
51
  format_numbered_list_with_parens(formatted_text)
51
52
  end
52
53
 
54
+ def replace_parens
55
+ ROMAN_NUMERALS.each do |rm|
56
+ next unless text =~ /\(#{Regexp.escape(rm)}\)\s[A-Z]/
57
+ text.gsub!(/\(#{Regexp.escape(rm)}\)(?=\s[A-Z])/) do |match|
58
+ match.gsub!(/\(/, '&✂&').gsub!(/\)/, '&⌬&')
59
+ end
60
+ end
61
+ text
62
+ end
63
+
53
64
  private
54
65
 
55
66
  def format_numbered_list_with_parens(txt)
@@ -171,7 +182,7 @@ module PragmaticSegmenter
171
182
  def iterate_alphabet_array(regex, parens, txt, roman_numeral)
172
183
  list_array = txt.scan(regex).map(&:downcase)
173
184
  if roman_numeral
174
- alphabet = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv x xi xii xiii xv xvi xvii xviii xix xx)
185
+ alphabet = ROMAN_NUMERALS
175
186
  else
176
187
  alphabet = ('a'..'z').to_a
177
188
  end
@@ -25,6 +25,9 @@ module PragmaticSegmenter
25
25
  # Rubular: http://rubular.com/r/JMjlZHAT4g
26
26
  SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
27
27
 
28
+ # Rubular: http://rubular.com/r/mQ8Es9bxtk
29
+ CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
30
+
28
31
  attr_reader :text, :doc_type
29
32
  def initialize(text:, doc_type:)
30
33
  @text = text
@@ -35,6 +38,7 @@ module PragmaticSegmenter
35
38
  reformatted_text = PragmaticSegmenter::List.new(text: text).add_line_break
36
39
  reformatted_text = replace_abbreviations(reformatted_text)
37
40
  reformatted_text = replace_numbers(reformatted_text)
41
+ reformatted_text = replace_continuous_punctuation(reformatted_text)
38
42
  reformatted_text.apply(AbbreviationsWithMultiplePeriodsAndEmailRule)
39
43
  reformatted_text.apply(GeoLocationRule)
40
44
  split_into_segments(reformatted_text)
@@ -69,6 +73,13 @@ module PragmaticSegmenter
69
73
  end
70
74
  end
71
75
 
76
+ def replace_continuous_punctuation(txt)
77
+ return txt unless txt =~ CONTINUOUS_PUNCTUATION_REGEX
78
+ txt.gsub!(CONTINUOUS_PUNCTUATION_REGEX) do |match|
79
+ match.gsub!(/!/, '&ᓴ&').gsub!(/\?/, '&ᓷ&')
80
+ end
81
+ end
82
+
72
83
  def consecutive_underscore?(txt)
73
84
  # Rubular: http://rubular.com/r/fTF2Ff3WBL
74
85
  txt.gsub(/_{3,}/, '').length.eql?(0)
@@ -85,12 +96,13 @@ module PragmaticSegmenter
85
96
  def process_text(txt)
86
97
  txt << 'ȸ' unless punctuation_array.any? { |p| txt[-1].include?(p) }
87
98
  PragmaticSegmenter::ExclamationWords.apply_rules(txt)
88
- between_punctutation(txt)
99
+ between_punctuation(txt)
89
100
  txt = txt.apply(
90
- DoublePuctationRules::All,
101
+ DoublePunctuationRules::All,
91
102
  QuestionMarkInQuotationRule,
92
103
  ExclamationPointRules::All
93
104
  )
105
+ txt = PragmaticSegmenter::List.new(text: txt).replace_parens
94
106
  sentence_boundary_punctuation(txt)
95
107
  end
96
108
 
@@ -106,7 +118,7 @@ module PragmaticSegmenter
106
118
  @punct_arr ||= PragmaticSegmenter::Punctuation.new.punct
107
119
  end
108
120
 
109
- def between_punctutation(txt)
121
+ def between_punctuation(txt)
110
122
  PragmaticSegmenter::BetweenPunctuation.new(text: txt).replace
111
123
  end
112
124
 
@@ -28,7 +28,7 @@ module PragmaticSegmenter
28
28
  All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
29
29
  end
30
30
 
31
- module DoublePuctationRules
31
+ module DoublePunctuationRules
32
32
  FirstRule = Rule.new(/\?!/, '☉')
33
33
  SecondRule = Rule.new(/!\?/, '☈')
34
34
  ThirdRule = Rule.new(/\?\?/, '☇')
@@ -64,6 +64,7 @@ module PragmaticSegmenter
64
64
  MixedDoubleEQ = Rule.new(/☈/, '!?')
65
65
  MixedDoubleEE = Rule.new(/☄/, '!!')
66
66
  LeftParens = Rule.new(/&✂&/, '(')
67
+ RightParens = Rule.new(/&⌬&/, ')')
67
68
  TemporaryEndingPunctutation = Rule.new('ȸ', '')
68
69
  Newline = Rule.new(/ȹ/, "\n")
69
70
 
@@ -74,7 +75,7 @@ module PragmaticSegmenter
74
75
  FullWidthQuestionMark, MixedDoubleQE,
75
76
  MixedDoubleQQ, MixedDoubleEQ,
76
77
  MixedDoubleEE, LeftParens,
77
- TemporaryEndingPunctutation,
78
+ RightParens, TemporaryEndingPunctutation,
78
79
  Newline ]
79
80
  end
80
81
 
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -1515,6 +1515,36 @@ RSpec.describe PragmaticSegmenter::Segmenter do
1515
1515
  ps = PragmaticSegmenter::Segmenter.new(text: "Hello. 'This is a test of single quotes.' A new sentence.")
1516
1516
  expect(ps.segment).to eq(["Hello.", "'This is a test of single quotes.'", "A new sentence."])
1517
1517
  end
1518
+
1519
+ it "correctly segments text #099" do
1520
+ ps = PragmaticSegmenter::Segmenter.new(text: "[A sentence in square brackets.]")
1521
+ expect(ps.segment).to eq(["[A sentence in square brackets.]"])
1522
+ end
1523
+
1524
+ it "correctly segments text #100" do
1525
+ ps = PragmaticSegmenter::Segmenter.new(text: "(iii) List item number 3.")
1526
+ expect(ps.segment).to eq(["(iii) List item number 3."])
1527
+ end
1528
+
1529
+ it "correctly segments text #101" do
1530
+ ps = PragmaticSegmenter::Segmenter.new(text: "Unbelievable??!?!")
1531
+ expect(ps.segment).to eq(["Unbelievable??!?!"])
1532
+ end
1533
+
1534
+ it "correctly segments text #102" do
1535
+ ps = PragmaticSegmenter::Segmenter.new(text: "This abbreviation f.e. means for example.")
1536
+ expect(ps.segment).to eq(["This abbreviation f.e. means for example."])
1537
+ end
1538
+
1539
+ it "correctly segments text #103" do
1540
+ ps = PragmaticSegmenter::Segmenter.new(text: "The med. staff here is very kind.")
1541
+ expect(ps.segment).to eq(["The med. staff here is very kind."])
1542
+ end
1543
+
1544
+ it "correctly segments text #104" do
1545
+ ps = PragmaticSegmenter::Segmenter.new(text: "What did you order btw., she wondered.")
1546
+ expect(ps.segment).to eq(["What did you order btw., she wondered."])
1547
+ end
1518
1548
  end
1519
1549
  end
1520
1550
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-26 00:00:00.000000000 Z
11
+ date: 2015-02-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler