pragmatic_segmenter 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1ea9b87e9654b097486aa24301ec3da33c8b3b79
4
- data.tar.gz: 0db4660da0b5ebd3a1536d2ad29ddc023e09a81d
3
+ metadata.gz: 1e6be4ac5dd27c491dbbed17b572b6503adb2707
4
+ data.tar.gz: 63d4c42adb13a83e5aa99702a7d7be0a162de0ad
5
5
  SHA512:
6
- metadata.gz: 5737824cefe9a0c378e540857b0fc7122c7e89e738f1cb05722e7103346c8683be532b2c42ab21e199624d4f21f9a681511cb383cced542bfad30287d1c8893e
7
- data.tar.gz: 0bfa44c11c70dc1af5b87b55499b6fb2ae306e541e5253335db45fbbc060316d2fa0c78477c36cf574e21ded594f2ccab9e0a6693154816217079df5d495ad16
6
+ metadata.gz: c21a055c652ffee7f819b79dbf49da0aeb2a068c16b14edbf995ba30f545824c359f61066490b8b5d15ef356df21edc890ea8cfc831fc246eeb293b038868250
7
+ data.tar.gz: 985d1121bc725a65d5b3ace00e1f03f1efa1e9e8edac9ae8657a9b4cb6c736720ed000fa825a3ab9b30fce928cd25f689c6068e3292df1c65c7f3b808fd103a2
data/README.md CHANGED
@@ -637,6 +637,20 @@ Hola Srta. Ledesma. Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre
637
637
  => ["کیا حال ہے؟", "ميرا نام ___ ەے۔", "میں حالا تاوان دےدوں؟"]
638
638
  ```
639
639
 
640
+ #### Golden Rules (Dutch)
641
+
642
+ 1.) **Sentence starting with a number**
643
+ ```
644
+ Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen. 81 procent van de schoten was raak.
645
+ => ["Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen.", "81 procent van de schoten was raak."]
646
+ ```
647
+
648
+ 2.) **Sentence starting with an ellipsis**
649
+ ```
650
+ 81 procent van de schoten was raak. ...en toen barste de hel los.
651
+ => ["81 procent van de schoten was raak.", "...en toen barste de hel los."]
652
+ ```
653
+
640
654
  ## Comparison of Segmentation Tools, Libraries and Algorithms
641
655
 
642
656
  Name | Programming Language | License | GRS (English) | GRS (Other Languages)† | Speed‡
@@ -657,11 +671,12 @@ Other tools not yet tested:
657
671
  * [FreeLing](http://nlp.lsi.upc.edu/freeling/)
658
672
  * [Alpino](http://www.let.rug.nl/vannoord/alp/Alpino/)
659
673
  * [trtok](https://github.com/jirkamarsik/trainable-tokenizer)
660
- * [segtok](https://pypi.python.org/pypi/segtok/1.1.0)
674
+ * [segtok](https://github.com/fnl/segtok)
661
675
  * [LingPipe](http://alias-i.com/lingpipe/demos/tutorial/sentences/read-me.html)
662
676
  * [Elephant](http://gmb.let.rug.nl/elephant/experiments.php)
663
677
  * [Ucto: Unicode Tokenizer](http://ilk.uvt.nl/ucto/)
664
678
  * [tokenizer](http://moin.delph-in.net/WeSearch/DocumentParsing)
679
+ * [spaCy](http://honnibal.github.io/spaCy/)
665
680
 
666
681
  ## Speed Performance Benchmarks
667
682
 
@@ -779,11 +794,17 @@ To test the relative performance of different segmentation tools and libraries I
779
794
  * Fix bug in splitting new sentence after single quotes
780
795
 
781
796
  **Version 0.2.0**
782
- * Add Dutch Golden rules and abbreviations
797
+ * Add Dutch Golden Rules and abbreviations
783
798
  * Update README with additional tools
784
799
  * Update segmentation test scores in README with results of new Golden Rule tests
785
800
  * Add Polish abbreviations
786
801
 
802
+ **Version 0.3.0**
803
+ * Add support for square brackets
804
+ * Add support for continuous exclamation points or question marks or combinations of both
805
+ * Fix Roman numeral support
806
+ * Add English abbreviations
807
+
787
808
  ## Contributing
788
809
 
789
810
  If you find a text that is incorrectly segmented using this gem, please submit an issue.
@@ -3,7 +3,7 @@
3
3
  module PragmaticSegmenter
4
4
  # Defines the abbreviations for each language (if available)
5
5
  class Abbreviation
6
- ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
6
+ ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
7
7
  PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']
8
8
  NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
9
9
 
@@ -17,6 +17,9 @@ module PragmaticSegmenter
17
17
  # Rubular: http://rubular.com/r/JbAIpKdlSq
18
18
  BETWEEN_QUOTE_SLANTED_REGEX = /“(?>[^”\\]+|\\{2}|\\.)*”/
19
19
 
20
+ # Rubular: http://rubular.com/r/WX4AvnZvlX
21
+ BETWEEN_SQUARE_BRACKETS_REGEX = /\[(?>[^\]\\]+|\\{2}|\\.)*\]/
22
+
20
23
  # Rubular: http://rubular.com/r/6tTityPflI
21
24
  BETWEEN_PARENS_REGEX = /\((?>[^\(\)\\]+|\\{2}|\\.)*\)/
22
25
 
@@ -34,6 +37,7 @@ module PragmaticSegmenter
34
37
  def sub_punctuation_between_quotes_and_parens(txt)
35
38
  sub_punctuation_between_single_quotes(txt)
36
39
  sub_punctuation_between_double_quotes(txt)
40
+ sub_punctuation_between_square_brackets(txt)
37
41
  sub_punctuation_between_parens(txt)
38
42
  sub_punctuation_between_quotes_arrow(txt)
39
43
  sub_punctuation_between_quotes_slanted(txt)
@@ -46,6 +50,13 @@ module PragmaticSegmenter
46
50
  ).replace
47
51
  end
48
52
 
53
# Collects every span of +txt+ matched by BETWEEN_SQUARE_BRACKETS_REGEX
# and passes those spans, together with +txt+ itself, to a
# PragmaticSegmenter::PunctuationReplacer.
#
# @param txt [String] text to scan for square-bracketed spans
# @return whatever PunctuationReplacer#replace returns
#   (NOTE(review): presumably the text with punctuation inside the
#   brackets masked - confirm against PunctuationReplacer)
def sub_punctuation_between_square_brackets(txt)
  bracketed_spans = txt.scan(BETWEEN_SQUARE_BRACKETS_REGEX)
  replacer = PragmaticSegmenter::PunctuationReplacer.new(
    matches_array: bracketed_spans,
    text: txt
  )
  replacer.replace
end
59
+
49
60
  def sub_punctuation_between_single_quotes(txt)
50
61
  PragmaticSegmenter::PunctuationReplacer.new(
51
62
  matches_array: txt.scan(BETWEEN_SINGLE_QUOTES_REGEX),
@@ -4,7 +4,7 @@ module PragmaticSegmenter
4
4
  class Process < PragmaticSegmenter::Process
5
5
  private
6
6
 
7
- def between_punctutation(txt)
7
+ def between_punctuation(txt)
8
8
  PragmaticSegmenter::Languages::Deutsch::BetweenPunctuation.new(text: txt).replace
9
9
  end
10
10
 
@@ -4,7 +4,7 @@ module PragmaticSegmenter
4
4
  class Process < PragmaticSegmenter::Process
5
5
  private
6
6
 
7
- def between_punctutation(txt)
7
+ def between_punctuation(txt)
8
8
  PragmaticSegmenter::Languages::Japanese::BetweenPunctuation.new(text: txt).replace
9
9
  end
10
10
  end
@@ -4,6 +4,7 @@ module PragmaticSegmenter
4
4
  # This class searches for a list within a string and adds
5
5
  # newlines before each list item.
6
6
  class List
7
# Lowercase Roman numerals i..xx in ascending order, one entry each, so
# that consecutive numerals sit at consecutive indexes.
ROMAN_NUMERALS = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx)
7
8
  # Rubular: http://rubular.com/r/XcpaJKH0sz
8
9
  ALPHABETICAL_LIST_WITH_PERIODS =
9
10
  /(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/
@@ -50,6 +51,16 @@ module PragmaticSegmenter
50
51
  format_numbered_list_with_parens(formatted_text)
51
52
  end
52
53
 
54
# Masks the parentheses around a lowercase Roman numeral list marker,
# e.g. "(iii) List item", when the marker is followed by one whitespace
# character and an uppercase ASCII letter. "(" becomes "&✂&" and ")"
# becomes "&⌬&"; the LeftParens / RightParens replacement rules map
# these placeholders back to real parentheses.
#
# All numerals are matched in a single alternation pass instead of one
# scan-and-substitute pass per numeral. The result is the same because a
# masked marker contains no parentheses and so can never match again.
#
# +text+ is modified in place.
#
# @return [String] +text+ (the same object), with matching markers masked
def replace_parens
  numerals = Regexp.union(ROMAN_NUMERALS.uniq)
  # The lookahead keeps the following whitespace and capital letter out
  # of the match, so only the marker itself is rewritten.
  text.gsub!(/\((#{numerals})\)(?=\s[A-Z])/) do
    "&✂&#{Regexp.last_match(1)}&⌬&"
  end
  text
end
63
+
53
64
  private
54
65
 
55
66
  def format_numbered_list_with_parens(txt)
@@ -171,7 +182,7 @@ module PragmaticSegmenter
171
182
  def iterate_alphabet_array(regex, parens, txt, roman_numeral)
172
183
  list_array = txt.scan(regex).map(&:downcase)
173
184
  if roman_numeral
174
- alphabet = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv x xi xii xiii xv xvi xvii xviii xix xx)
185
+ alphabet = ROMAN_NUMERALS
175
186
  else
176
187
  alphabet = ('a'..'z').to_a
177
188
  end
@@ -25,6 +25,9 @@ module PragmaticSegmenter
25
25
  # Rubular: http://rubular.com/r/JMjlZHAT4g
26
26
  SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
27
27
 
28
+ # Rubular: http://rubular.com/r/mQ8Es9bxtk
29
+ CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
30
+
28
31
  attr_reader :text, :doc_type
29
32
  def initialize(text:, doc_type:)
30
33
  @text = text
@@ -35,6 +38,7 @@ module PragmaticSegmenter
35
38
  reformatted_text = PragmaticSegmenter::List.new(text: text).add_line_break
36
39
  reformatted_text = replace_abbreviations(reformatted_text)
37
40
  reformatted_text = replace_numbers(reformatted_text)
41
+ reformatted_text = replace_continuous_punctuation(reformatted_text)
38
42
  reformatted_text.apply(AbbreviationsWithMultiplePeriodsAndEmailRule)
39
43
  reformatted_text.apply(GeoLocationRule)
40
44
  split_into_segments(reformatted_text)
@@ -69,6 +73,13 @@ module PragmaticSegmenter
69
73
  end
70
74
  end
71
75
 
76
# Masks runs of three or more consecutive "!" / "?" characters (any mix)
# that match CONTINUOUS_PUNCTUATION_REGEX. Each "!" becomes "&ᓴ&" and
# each "?" becomes "&ᓷ&".
# NOTE(review): presumably this keeps the run from being treated as
# several sentence boundaries later on - confirm against the rules that
# restore these placeholders.
#
# +txt+ is modified in place.
#
# @param txt [String] text to process
# @return [String] +txt+ (the same object)
def replace_continuous_punctuation(txt)
  return txt unless txt =~ CONTINUOUS_PUNCTUATION_REGEX
  txt.gsub!(CONTINUOUS_PUNCTUATION_REGEX) do |run|
    # Use non-bang gsub here: gsub! returns nil when nothing was
    # substituted, so chaining it deleted runs made only of "!" and
    # raised NoMethodError on runs made only of "?".
    run.gsub('!', '&ᓴ&').gsub('?', '&ᓷ&')
  end
end
82
+
72
83
  def consecutive_underscore?(txt)
73
84
  # Rubular: http://rubular.com/r/fTF2Ff3WBL
74
85
  txt.gsub(/_{3,}/, '').length.eql?(0)
@@ -85,12 +96,13 @@ module PragmaticSegmenter
85
96
  def process_text(txt)
86
97
  txt << 'ȸ' unless punctuation_array.any? { |p| txt[-1].include?(p) }
87
98
  PragmaticSegmenter::ExclamationWords.apply_rules(txt)
88
- between_punctutation(txt)
99
+ between_punctuation(txt)
89
100
  txt = txt.apply(
90
- DoublePuctationRules::All,
101
+ DoublePunctuationRules::All,
91
102
  QuestionMarkInQuotationRule,
92
103
  ExclamationPointRules::All
93
104
  )
105
+ txt = PragmaticSegmenter::List.new(text: txt).replace_parens
94
106
  sentence_boundary_punctuation(txt)
95
107
  end
96
108
 
@@ -106,7 +118,7 @@ module PragmaticSegmenter
106
118
  @punct_arr ||= PragmaticSegmenter::Punctuation.new.punct
107
119
  end
108
120
 
109
- def between_punctutation(txt)
121
+ def between_punctuation(txt)
110
122
  PragmaticSegmenter::BetweenPunctuation.new(text: txt).replace
111
123
  end
112
124
 
@@ -28,7 +28,7 @@ module PragmaticSegmenter
28
28
  All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
29
29
  end
30
30
 
31
- module DoublePuctationRules
31
+ module DoublePunctuationRules
32
32
  FirstRule = Rule.new(/\?!/, '☉')
33
33
  SecondRule = Rule.new(/!\?/, '☈')
34
34
  ThirdRule = Rule.new(/\?\?/, '☇')
@@ -64,6 +64,7 @@ module PragmaticSegmenter
64
64
  MixedDoubleEQ = Rule.new(/☈/, '!?')
65
65
  MixedDoubleEE = Rule.new(/☄/, '!!')
66
66
  LeftParens = Rule.new(/&✂&/, '(')
67
+ RightParens = Rule.new(/&⌬&/, ')')
67
68
  TemporaryEndingPunctutation = Rule.new('ȸ', '')
68
69
  Newline = Rule.new(/ȹ/, "\n")
69
70
 
@@ -74,7 +75,7 @@ module PragmaticSegmenter
74
75
  FullWidthQuestionMark, MixedDoubleQE,
75
76
  MixedDoubleQQ, MixedDoubleEQ,
76
77
  MixedDoubleEE, LeftParens,
77
- TemporaryEndingPunctutation,
78
+ RightParens, TemporaryEndingPunctutation,
78
79
  Newline ]
79
80
  end
80
81
 
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -1515,6 +1515,36 @@ RSpec.describe PragmaticSegmenter::Segmenter do
1515
1515
  ps = PragmaticSegmenter::Segmenter.new(text: "Hello. 'This is a test of single quotes.' A new sentence.")
1516
1516
  expect(ps.segment).to eq(["Hello.", "'This is a test of single quotes.'", "A new sentence."])
1517
1517
  end
1518
+
1519
+ it "correctly segments text #099" do
1520
+ ps = PragmaticSegmenter::Segmenter.new(text: "[A sentence in square brackets.]")
1521
+ expect(ps.segment).to eq(["[A sentence in square brackets.]"])
1522
+ end
1523
+
1524
+ it "correctly segments text #100" do
1525
+ ps = PragmaticSegmenter::Segmenter.new(text: "(iii) List item number 3.")
1526
+ expect(ps.segment).to eq(["(iii) List item number 3."])
1527
+ end
1528
+
1529
+ it "correctly segments text #101" do
1530
+ ps = PragmaticSegmenter::Segmenter.new(text: "Unbelievable??!?!")
1531
+ expect(ps.segment).to eq(["Unbelievable??!?!"])
1532
+ end
1533
+
1534
+ it "correctly segments text #102" do
1535
+ ps = PragmaticSegmenter::Segmenter.new(text: "This abbreviation f.e. means for example.")
1536
+ expect(ps.segment).to eq(["This abbreviation f.e. means for example."])
1537
+ end
1538
+
1539
+ it "correctly segments text #103" do
1540
+ ps = PragmaticSegmenter::Segmenter.new(text: "The med. staff here is very kind.")
1541
+ expect(ps.segment).to eq(["The med. staff here is very kind."])
1542
+ end
1543
+
1544
+ it "correctly segments text #104" do
1545
+ ps = PragmaticSegmenter::Segmenter.new(text: "What did you order btw., she wondered.")
1546
+ expect(ps.segment).to eq(["What did you order btw., she wondered."])
1547
+ end
1518
1548
  end
1519
1549
  end
1520
1550
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-26 00:00:00.000000000 Z
11
+ date: 2015-02-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler