pragmatic_segmenter 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +23 -2
- data/lib/pragmatic_segmenter/abbreviation.rb +1 -1
- data/lib/pragmatic_segmenter/between_punctuation.rb +11 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +1 -1
- data/lib/pragmatic_segmenter/languages/japanese.rb +1 -1
- data/lib/pragmatic_segmenter/list.rb +12 -1
- data/lib/pragmatic_segmenter/process.rb +15 -3
- data/lib/pragmatic_segmenter/rules.rb +3 -2
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +30 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1e6be4ac5dd27c491dbbed17b572b6503adb2707
|
4
|
+
data.tar.gz: 63d4c42adb13a83e5aa99702a7d7be0a162de0ad
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c21a055c652ffee7f819b79dbf49da0aeb2a068c16b14edbf995ba30f545824c359f61066490b8b5d15ef356df21edc890ea8cfc831fc246eeb293b038868250
|
7
|
+
data.tar.gz: 985d1121bc725a65d5b3ace00e1f03f1efa1e9e8edac9ae8657a9b4cb6c736720ed000fa825a3ab9b30fce928cd25f689c6068e3292df1c65c7f3b808fd103a2
|
data/README.md
CHANGED
@@ -637,6 +637,20 @@ Hola Srta. Ledesma. Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre
|
|
637
637
|
=> ["کیا حال ہے؟", "ميرا نام ___ ەے۔", "میں حالا تاوان دےدوں؟"]
|
638
638
|
```
|
639
639
|
|
640
|
+
####Golden Rules (Dutch)
|
641
|
+
|
642
|
+
1.) **Sentence starting with a number**
|
643
|
+
```
|
644
|
+
Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen. 81 procent van de schoten was raak.
|
645
|
+
=> ["Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen.", "81 procent van de schoten was raak."]
|
646
|
+
```
|
647
|
+
|
648
|
+
2.) **Sentence starting with an ellipsis**
|
649
|
+
```
|
650
|
+
81 procent van de schoten was raak. ...en toen barste de hel los.
|
651
|
+
=> ["81 procent van de schoten was raak.", "...en toen barste de hel los."]
|
652
|
+
```
|
653
|
+
|
640
654
|
## Comparison of Segmentation Tools, Libraries and Algorithms
|
641
655
|
|
642
656
|
Name | Programming Language | License | GRS (English) | GRS (Other Languages)† | Speed‡
|
@@ -657,11 +671,12 @@ Other tools not yet tested:
|
|
657
671
|
* [FreeLing](http://nlp.lsi.upc.edu/freeling/)
|
658
672
|
* [Alpino](http://www.let.rug.nl/vannoord/alp/Alpino/)
|
659
673
|
* [trtok](https://github.com/jirkamarsik/trainable-tokenizer)
|
660
|
-
* [segtok](https://
|
674
|
+
* [segtok](https://github.com/fnl/segtok)
|
661
675
|
* [LingPipe](http://alias-i.com/lingpipe/demos/tutorial/sentences/read-me.html)
|
662
676
|
* [Elephant](http://gmb.let.rug.nl/elephant/experiments.php)
|
663
677
|
* [Ucto: Unicode Tokenizer](http://ilk.uvt.nl/ucto/)
|
664
678
|
* [tokenizer](http://moin.delph-in.net/WeSearch/DocumentParsing)
|
679
|
+
* [spaCy](http://honnibal.github.io/spaCy/)
|
665
680
|
|
666
681
|
## Speed Performance Benchmarks
|
667
682
|
|
@@ -779,11 +794,17 @@ To test the relative performance of different segmentation tools and libraries I
|
|
779
794
|
* Fix bug in splitting new sentence after single quotes
|
780
795
|
|
781
796
|
**Version 0.2.0**
|
782
|
-
* Add Dutch Golden
|
797
|
+
* Add Dutch Golden Rules and abbreviations
|
783
798
|
* Update README with additional tools
|
784
799
|
* Update segmentation test scores in README with results of new Golden Rule tests
|
785
800
|
* Add Polish abbreviations
|
786
801
|
|
802
|
+
**Version 0.3.0**
|
803
|
+
* Add support for square brackets
|
804
|
+
* Add support for continuous exclamation points or question marks or combinations of both
|
805
|
+
* Fix Roman numeral support
|
806
|
+
* Add English abbreviations
|
807
|
+
|
787
808
|
## Contributing
|
788
809
|
|
789
810
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -3,7 +3,7 @@
|
|
3
3
|
module PragmaticSegmenter
|
4
4
|
# Defines the abbreviations for each language (if available)
|
5
5
|
class Abbreviation
|
6
|
-
ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
|
6
|
+
ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
|
7
7
|
PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']
|
8
8
|
NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
|
9
9
|
|
@@ -17,6 +17,9 @@ module PragmaticSegmenter
|
|
17
17
|
# Rubular: http://rubular.com/r/JbAIpKdlSq
|
18
18
|
BETWEEN_QUOTE_SLANTED_REGEX = /“(?>[^”\\]+|\\{2}|\\.)*”/
|
19
19
|
|
20
|
+
# Rubular: http://rubular.com/r/WX4AvnZvlX
|
21
|
+
BETWEEN_SQUARE_BRACKETS_REGEX = /\[(?>[^\]\\]+|\\{2}|\\.)*\]/
|
22
|
+
|
20
23
|
# Rubular: http://rubular.com/r/6tTityPflI
|
21
24
|
BETWEEN_PARENS_REGEX = /\((?>[^\(\)\\]+|\\{2}|\\.)*\)/
|
22
25
|
|
@@ -34,6 +37,7 @@ module PragmaticSegmenter
|
|
34
37
|
def sub_punctuation_between_quotes_and_parens(txt)
|
35
38
|
sub_punctuation_between_single_quotes(txt)
|
36
39
|
sub_punctuation_between_double_quotes(txt)
|
40
|
+
sub_punctuation_between_square_brackets(txt)
|
37
41
|
sub_punctuation_between_parens(txt)
|
38
42
|
sub_punctuation_between_quotes_arrow(txt)
|
39
43
|
sub_punctuation_between_quotes_slanted(txt)
|
@@ -46,6 +50,13 @@ module PragmaticSegmenter
|
|
46
50
|
).replace
|
47
51
|
end
|
48
52
|
|
53
|
+
def sub_punctuation_between_square_brackets(txt)
|
54
|
+
PragmaticSegmenter::PunctuationReplacer.new(
|
55
|
+
matches_array: txt.scan(BETWEEN_SQUARE_BRACKETS_REGEX),
|
56
|
+
text: txt
|
57
|
+
).replace
|
58
|
+
end
|
59
|
+
|
49
60
|
def sub_punctuation_between_single_quotes(txt)
|
50
61
|
PragmaticSegmenter::PunctuationReplacer.new(
|
51
62
|
matches_array: txt.scan(BETWEEN_SINGLE_QUOTES_REGEX),
|
@@ -4,6 +4,7 @@ module PragmaticSegmenter
|
|
4
4
|
# This class searches for a list within a string and adds
|
5
5
|
# newlines before each list item.
|
6
6
|
class List
|
7
|
+
# Lowercase Roman numerals 1-20 in ascending order, each listed exactly once so
# that a numeral's array position reflects its place in the sequence.
ROMAN_NUMERALS = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx)
|
7
8
|
# Rubular: http://rubular.com/r/XcpaJKH0sz
|
8
9
|
ALPHABETICAL_LIST_WITH_PERIODS =
|
9
10
|
/(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/
|
@@ -50,6 +51,16 @@ module PragmaticSegmenter
|
|
50
51
|
format_numbered_list_with_parens(formatted_text)
|
51
52
|
end
|
52
53
|
|
54
|
+
def replace_parens
|
55
|
+
ROMAN_NUMERALS.each do |rm|
|
56
|
+
next unless text =~ /\(#{Regexp.escape(rm)}\)\s[A-Z]/
|
57
|
+
text.gsub!(/\(#{Regexp.escape(rm)}\)(?=\s[A-Z])/) do |match|
|
58
|
+
match.gsub!(/\(/, '&✂&').gsub!(/\)/, '&⌬&')
|
59
|
+
end
|
60
|
+
end
|
61
|
+
text
|
62
|
+
end
|
63
|
+
|
53
64
|
private
|
54
65
|
|
55
66
|
def format_numbered_list_with_parens(txt)
|
@@ -171,7 +182,7 @@ module PragmaticSegmenter
|
|
171
182
|
def iterate_alphabet_array(regex, parens, txt, roman_numeral)
|
172
183
|
list_array = txt.scan(regex).map(&:downcase)
|
173
184
|
if roman_numeral
|
174
|
-
alphabet =
|
185
|
+
alphabet = ROMAN_NUMERALS
|
175
186
|
else
|
176
187
|
alphabet = ('a'..'z').to_a
|
177
188
|
end
|
@@ -25,6 +25,9 @@ module PragmaticSegmenter
|
|
25
25
|
# Rubular: http://rubular.com/r/JMjlZHAT4g
|
26
26
|
SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
|
27
27
|
|
28
|
+
# Rubular: http://rubular.com/r/mQ8Es9bxtk
|
29
|
+
CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
|
30
|
+
|
28
31
|
attr_reader :text, :doc_type
|
29
32
|
def initialize(text:, doc_type:)
|
30
33
|
@text = text
|
@@ -35,6 +38,7 @@ module PragmaticSegmenter
|
|
35
38
|
reformatted_text = PragmaticSegmenter::List.new(text: text).add_line_break
|
36
39
|
reformatted_text = replace_abbreviations(reformatted_text)
|
37
40
|
reformatted_text = replace_numbers(reformatted_text)
|
41
|
+
reformatted_text = replace_continuous_punctuation(reformatted_text)
|
38
42
|
reformatted_text.apply(AbbreviationsWithMultiplePeriodsAndEmailRule)
|
39
43
|
reformatted_text.apply(GeoLocationRule)
|
40
44
|
split_into_segments(reformatted_text)
|
@@ -69,6 +73,13 @@ module PragmaticSegmenter
|
|
69
73
|
end
|
70
74
|
end
|
71
75
|
|
76
|
+
def replace_continuous_punctuation(txt)
|
77
|
+
return txt unless txt =~ CONTINUOUS_PUNCTUATION_REGEX
|
78
|
+
txt.gsub!(CONTINUOUS_PUNCTUATION_REGEX) do |match|
|
79
|
+
# Use non-bang gsub: gsub! returns nil when nothing is replaced, so a run made
# only of '?' raised NoMethodError and a run made only of '!' was replaced by ''.
match.gsub(/!/, '&ᓴ&').gsub(/\?/, '&ᓷ&')
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
72
83
|
def consecutive_underscore?(txt)
|
73
84
|
# Rubular: http://rubular.com/r/fTF2Ff3WBL
|
74
85
|
txt.gsub(/_{3,}/, '').length.eql?(0)
|
@@ -85,12 +96,13 @@ module PragmaticSegmenter
|
|
85
96
|
def process_text(txt)
|
86
97
|
txt << 'ȸ' unless punctuation_array.any? { |p| txt[-1].include?(p) }
|
87
98
|
PragmaticSegmenter::ExclamationWords.apply_rules(txt)
|
88
|
-
|
99
|
+
between_punctuation(txt)
|
89
100
|
txt = txt.apply(
|
90
|
-
|
101
|
+
DoublePunctuationRules::All,
|
91
102
|
QuestionMarkInQuotationRule,
|
92
103
|
ExclamationPointRules::All
|
93
104
|
)
|
105
|
+
txt = PragmaticSegmenter::List.new(text: txt).replace_parens
|
94
106
|
sentence_boundary_punctuation(txt)
|
95
107
|
end
|
96
108
|
|
@@ -106,7 +118,7 @@ module PragmaticSegmenter
|
|
106
118
|
@punct_arr ||= PragmaticSegmenter::Punctuation.new.punct
|
107
119
|
end
|
108
120
|
|
109
|
-
def
|
121
|
+
def between_punctuation(txt)
|
110
122
|
PragmaticSegmenter::BetweenPunctuation.new(text: txt).replace
|
111
123
|
end
|
112
124
|
|
@@ -28,7 +28,7 @@ module PragmaticSegmenter
|
|
28
28
|
All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
|
29
29
|
end
|
30
30
|
|
31
|
-
module
|
31
|
+
module DoublePunctuationRules
|
32
32
|
FirstRule = Rule.new(/\?!/, '☉')
|
33
33
|
SecondRule = Rule.new(/!\?/, '☈')
|
34
34
|
ThirdRule = Rule.new(/\?\?/, '☇')
|
@@ -64,6 +64,7 @@ module PragmaticSegmenter
|
|
64
64
|
MixedDoubleEQ = Rule.new(/☈/, '!?')
|
65
65
|
MixedDoubleEE = Rule.new(/☄/, '!!')
|
66
66
|
LeftParens = Rule.new(/&✂&/, '(')
|
67
|
+
RightParens = Rule.new(/&⌬&/, ')')
|
67
68
|
TemporaryEndingPunctutation = Rule.new('ȸ', '')
|
68
69
|
Newline = Rule.new(/ȹ/, "\n")
|
69
70
|
|
@@ -74,7 +75,7 @@ module PragmaticSegmenter
|
|
74
75
|
FullWidthQuestionMark, MixedDoubleQE,
|
75
76
|
MixedDoubleQQ, MixedDoubleEQ,
|
76
77
|
MixedDoubleEE, LeftParens,
|
77
|
-
TemporaryEndingPunctutation,
|
78
|
+
RightParens, TemporaryEndingPunctutation,
|
78
79
|
Newline ]
|
79
80
|
end
|
80
81
|
|
@@ -1515,6 +1515,36 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
1515
1515
|
ps = PragmaticSegmenter::Segmenter.new(text: "Hello. 'This is a test of single quotes.' A new sentence.")
|
1516
1516
|
expect(ps.segment).to eq(["Hello.", "'This is a test of single quotes.'", "A new sentence."])
|
1517
1517
|
end
|
1518
|
+
|
1519
|
+
it "correctly segments text #099" do
|
1520
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "[A sentence in square brackets.]")
|
1521
|
+
expect(ps.segment).to eq(["[A sentence in square brackets.]"])
|
1522
|
+
end
|
1523
|
+
|
1524
|
+
it "correctly segments text #100" do
|
1525
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "(iii) List item number 3.")
|
1526
|
+
expect(ps.segment).to eq(["(iii) List item number 3."])
|
1527
|
+
end
|
1528
|
+
|
1529
|
+
it "correctly segments text #101" do
|
1530
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Unbelievable??!?!")
|
1531
|
+
expect(ps.segment).to eq(["Unbelievable??!?!"])
|
1532
|
+
end
|
1533
|
+
|
1534
|
+
it "correctly segments text #102" do
|
1535
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "This abbreviation f.e. means for example.")
|
1536
|
+
expect(ps.segment).to eq(["This abbreviation f.e. means for example."])
|
1537
|
+
end
|
1538
|
+
|
1539
|
+
it "correctly segments text #103" do
|
1540
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "The med. staff here is very kind.")
|
1541
|
+
expect(ps.segment).to eq(["The med. staff here is very kind."])
|
1542
|
+
end
|
1543
|
+
|
1544
|
+
it "correctly segments text #104" do
|
1545
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "What did you order btw., she wondered.")
|
1546
|
+
expect(ps.segment).to eq(["What did you order btw., she wondered."])
|
1547
|
+
end
|
1518
1548
|
end
|
1519
1549
|
end
|
1520
1550
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-02-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|