pragmatic_segmenter 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +23 -2
- data/lib/pragmatic_segmenter/abbreviation.rb +1 -1
- data/lib/pragmatic_segmenter/between_punctuation.rb +11 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +1 -1
- data/lib/pragmatic_segmenter/languages/japanese.rb +1 -1
- data/lib/pragmatic_segmenter/list.rb +12 -1
- data/lib/pragmatic_segmenter/process.rb +15 -3
- data/lib/pragmatic_segmenter/rules.rb +3 -2
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +30 -0
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1e6be4ac5dd27c491dbbed17b572b6503adb2707
|
|
4
|
+
data.tar.gz: 63d4c42adb13a83e5aa99702a7d7be0a162de0ad
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: c21a055c652ffee7f819b79dbf49da0aeb2a068c16b14edbf995ba30f545824c359f61066490b8b5d15ef356df21edc890ea8cfc831fc246eeb293b038868250
|
|
7
|
+
data.tar.gz: 985d1121bc725a65d5b3ace00e1f03f1efa1e9e8edac9ae8657a9b4cb6c736720ed000fa825a3ab9b30fce928cd25f689c6068e3292df1c65c7f3b808fd103a2
|
data/README.md
CHANGED
|
@@ -637,6 +637,20 @@ Hola Srta. Ledesma. Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre
|
|
|
637
637
|
=> ["کیا حال ہے؟", "ميرا نام ___ ەے۔", "میں حالا تاوان دےدوں؟"]
|
|
638
638
|
```
|
|
639
639
|
|
|
640
|
+
####Golden Rules (Dutch)
|
|
641
|
+
|
|
642
|
+
1.) **Sentence starting with a number**
|
|
643
|
+
```
|
|
644
|
+
Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen. 81 procent van de schoten was raak.
|
|
645
|
+
=> ["Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen.", "81 procent van de schoten was raak."]
|
|
646
|
+
```
|
|
647
|
+
|
|
648
|
+
2.) **Sentence starting with an ellipsis**
|
|
649
|
+
```
|
|
650
|
+
81 procent van de schoten was raak. ...en toen barste de hel los.
|
|
651
|
+
=> ["81 procent van de schoten was raak.", "...en toen barste de hel los."]
|
|
652
|
+
```
|
|
653
|
+
|
|
640
654
|
## Comparison of Segmentation Tools, Libraries and Algorithms
|
|
641
655
|
|
|
642
656
|
Name | Programming Language | License | GRS (English) | GRS (Other Languages)† | Speed‡
|
|
@@ -657,11 +671,12 @@ Other tools not yet tested:
|
|
|
657
671
|
* [FreeLing](http://nlp.lsi.upc.edu/freeling/)
|
|
658
672
|
* [Alpino](http://www.let.rug.nl/vannoord/alp/Alpino/)
|
|
659
673
|
* [trtok](https://github.com/jirkamarsik/trainable-tokenizer)
|
|
660
|
-
* [segtok](https://
|
|
674
|
+
* [segtok](https://github.com/fnl/segtok)
|
|
661
675
|
* [LingPipe](http://alias-i.com/lingpipe/demos/tutorial/sentences/read-me.html)
|
|
662
676
|
* [Elephant](http://gmb.let.rug.nl/elephant/experiments.php)
|
|
663
677
|
* [Ucto: Unicode Tokenizer](http://ilk.uvt.nl/ucto/)
|
|
664
678
|
* [tokenizer](http://moin.delph-in.net/WeSearch/DocumentParsing)
|
|
679
|
+
* [spaCy](http://honnibal.github.io/spaCy/)
|
|
665
680
|
|
|
666
681
|
## Speed Performance Benchmarks
|
|
667
682
|
|
|
@@ -779,11 +794,17 @@ To test the relative performance of different segmentation tools and libraries I
|
|
|
779
794
|
* Fix bug in splitting new sentence after single quotes
|
|
780
795
|
|
|
781
796
|
**Version 0.2.0**
|
|
782
|
-
* Add Dutch Golden
|
|
797
|
+
* Add Dutch Golden Rules and abbreviations
|
|
783
798
|
* Update README with additional tools
|
|
784
799
|
* Update segmentation test scores in README with results of new Golden Rule tests
|
|
785
800
|
* Add Polish abbreviations
|
|
786
801
|
|
|
802
|
+
**Version 0.3.0**
|
|
803
|
+
* Add support for square brackets
|
|
804
|
+
* Add support for continuous exclamation points or question marks or combinations of both
|
|
805
|
+
* Fix Roman numeral support
|
|
806
|
+
* Add English abbreviations
|
|
807
|
+
|
|
787
808
|
## Contributing
|
|
788
809
|
|
|
789
810
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
module PragmaticSegmenter
|
|
4
4
|
# Defines the abbreviations for each language (if available)
|
|
5
5
|
class Abbreviation
|
|
6
|
-
ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
|
|
6
|
+
ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
|
|
7
7
|
PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']
|
|
8
8
|
NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
|
|
9
9
|
|
|
@@ -17,6 +17,9 @@ module PragmaticSegmenter
|
|
|
17
17
|
# Rubular: http://rubular.com/r/JbAIpKdlSq
|
|
18
18
|
BETWEEN_QUOTE_SLANTED_REGEX = /“(?>[^”\\]+|\\{2}|\\.)*”/
|
|
19
19
|
|
|
20
|
+
# Rubular: http://rubular.com/r/WX4AvnZvlX
|
|
21
|
+
BETWEEN_SQUARE_BRACKETS_REGEX = /\[(?>[^\]\\]+|\\{2}|\\.)*\]/
|
|
22
|
+
|
|
20
23
|
# Rubular: http://rubular.com/r/6tTityPflI
|
|
21
24
|
BETWEEN_PARENS_REGEX = /\((?>[^\(\)\\]+|\\{2}|\\.)*\)/
|
|
22
25
|
|
|
@@ -34,6 +37,7 @@ module PragmaticSegmenter
|
|
|
34
37
|
def sub_punctuation_between_quotes_and_parens(txt)
|
|
35
38
|
sub_punctuation_between_single_quotes(txt)
|
|
36
39
|
sub_punctuation_between_double_quotes(txt)
|
|
40
|
+
sub_punctuation_between_square_brackets(txt)
|
|
37
41
|
sub_punctuation_between_parens(txt)
|
|
38
42
|
sub_punctuation_between_quotes_arrow(txt)
|
|
39
43
|
sub_punctuation_between_quotes_slanted(txt)
|
|
@@ -46,6 +50,13 @@ module PragmaticSegmenter
|
|
|
46
50
|
).replace
|
|
47
51
|
end
|
|
48
52
|
|
|
53
|
+
# Collects every span of txt that matches BETWEEN_SQUARE_BRACKETS_REGEX and
# hands those spans, together with the full text, to PunctuationReplacer.
# Returns whatever PunctuationReplacer#replace returns.
# NOTE(review): presumably the replacer masks sentence punctuation inside the
# brackets so it is not treated as a boundary - confirm in PunctuationReplacer.
def sub_punctuation_between_square_brackets(txt)
  bracketed_spans = txt.scan(BETWEEN_SQUARE_BRACKETS_REGEX)
  replacer = PragmaticSegmenter::PunctuationReplacer.new(
    matches_array: bracketed_spans,
    text: txt
  )
  replacer.replace
end
|
|
59
|
+
|
|
49
60
|
def sub_punctuation_between_single_quotes(txt)
|
|
50
61
|
PragmaticSegmenter::PunctuationReplacer.new(
|
|
51
62
|
matches_array: txt.scan(BETWEEN_SINGLE_QUOTES_REGEX),
|
|
@@ -4,6 +4,7 @@ module PragmaticSegmenter
|
|
|
4
4
|
# This class searches for a list within a string and adds
|
|
5
5
|
# newlines before each list item.
|
|
6
6
|
class List
|
|
7
|
+
# Lower-case Roman numerals i..xx in ascending order, each listed exactly once
# so that consecutive numerals sit at consecutive indexes.
ROMAN_NUMERALS = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx)
|
|
7
8
|
# Rubular: http://rubular.com/r/XcpaJKH0sz
|
|
8
9
|
ALPHABETICAL_LIST_WITH_PERIODS =
|
|
9
10
|
/(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/
|
|
@@ -50,6 +51,16 @@ module PragmaticSegmenter
|
|
|
50
51
|
format_numbered_list_with_parens(formatted_text)
|
|
51
52
|
end
|
|
52
53
|
|
|
54
|
+
# Masks the parentheses around a Roman-numeral list marker such as "(iii)"
# when the marker is followed by whitespace and an upper-case letter.
# "(" becomes "&✂&" and ")" becomes "&⌬&"; the LeftParens / RightParens
# rules translate these placeholders back to real parentheses.
#
# Mutates `text` in place and returns it.
def replace_parens
  # One alternation over all numerals replaces a separate pair of scans per
  # numeral; Regexp.union escapes each entry.
  numeral_in_parens = /\(#{Regexp.union(ROMAN_NUMERALS)}\)(?=\s[A-Z])/
  # Skip gsub! entirely when nothing matches so an unmatched text is
  # returned untouched.
  return text unless text =~ numeral_in_parens
  text.gsub!(numeral_in_parens) do |match|
    # Non-mutating sub: chaining bang methods is unsafe because they return
    # nil when no substitution is made.
    match.sub('(', '&✂&').sub(')', '&⌬&')
  end
  text
end
|
|
63
|
+
|
|
53
64
|
private
|
|
54
65
|
|
|
55
66
|
def format_numbered_list_with_parens(txt)
|
|
@@ -171,7 +182,7 @@ module PragmaticSegmenter
|
|
|
171
182
|
def iterate_alphabet_array(regex, parens, txt, roman_numeral)
|
|
172
183
|
list_array = txt.scan(regex).map(&:downcase)
|
|
173
184
|
if roman_numeral
|
|
174
|
-
alphabet =
|
|
185
|
+
alphabet = ROMAN_NUMERALS
|
|
175
186
|
else
|
|
176
187
|
alphabet = ('a'..'z').to_a
|
|
177
188
|
end
|
|
@@ -25,6 +25,9 @@ module PragmaticSegmenter
|
|
|
25
25
|
# Rubular: http://rubular.com/r/JMjlZHAT4g
|
|
26
26
|
SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
|
|
27
27
|
|
|
28
|
+
# Rubular: http://rubular.com/r/mQ8Es9bxtk
|
|
29
|
+
CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
|
|
30
|
+
|
|
28
31
|
attr_reader :text, :doc_type
|
|
29
32
|
def initialize(text:, doc_type:)
|
|
30
33
|
@text = text
|
|
@@ -35,6 +38,7 @@ module PragmaticSegmenter
|
|
|
35
38
|
reformatted_text = PragmaticSegmenter::List.new(text: text).add_line_break
|
|
36
39
|
reformatted_text = replace_abbreviations(reformatted_text)
|
|
37
40
|
reformatted_text = replace_numbers(reformatted_text)
|
|
41
|
+
reformatted_text = replace_continuous_punctuation(reformatted_text)
|
|
38
42
|
reformatted_text.apply(AbbreviationsWithMultiplePeriodsAndEmailRule)
|
|
39
43
|
reformatted_text.apply(GeoLocationRule)
|
|
40
44
|
split_into_segments(reformatted_text)
|
|
@@ -69,6 +73,13 @@ module PragmaticSegmenter
|
|
|
69
73
|
end
|
|
70
74
|
end
|
|
71
75
|
|
|
76
|
+
# Masks runs matched by CONTINUOUS_PUNCTUATION_REGEX (three or more "!" / "?"
# characters in any combination) by rewriting every "!" to "&ᓴ&" and every
# "?" to "&ᓷ&".
# NOTE(review): presumably these placeholders are restored to "!" / "?" later
# in the pipeline - confirm against the rules that reverse them.
#
# Mutates txt in place when a run is found and returns txt either way.
def replace_continuous_punctuation(txt)
  return txt unless txt =~ CONTINUOUS_PUNCTUATION_REGEX
  txt.gsub!(CONTINUOUS_PUNCTUATION_REGEX) do |match|
    # Non-mutating gsub is required here: gsub! returns nil when it changes
    # nothing, so a run of only "?" raised NoMethodError and a run of only
    # "!" produced a nil block value that erased the punctuation.
    match.gsub(/!/, '&ᓴ&').gsub(/\?/, '&ᓷ&')
  end
end
|
|
82
|
+
|
|
72
83
|
def consecutive_underscore?(txt)
|
|
73
84
|
# Rubular: http://rubular.com/r/fTF2Ff3WBL
|
|
74
85
|
txt.gsub(/_{3,}/, '').length.eql?(0)
|
|
@@ -85,12 +96,13 @@ module PragmaticSegmenter
|
|
|
85
96
|
def process_text(txt)
|
|
86
97
|
txt << 'ȸ' unless punctuation_array.any? { |p| txt[-1].include?(p) }
|
|
87
98
|
PragmaticSegmenter::ExclamationWords.apply_rules(txt)
|
|
88
|
-
|
|
99
|
+
between_punctuation(txt)
|
|
89
100
|
txt = txt.apply(
|
|
90
|
-
|
|
101
|
+
DoublePunctuationRules::All,
|
|
91
102
|
QuestionMarkInQuotationRule,
|
|
92
103
|
ExclamationPointRules::All
|
|
93
104
|
)
|
|
105
|
+
txt = PragmaticSegmenter::List.new(text: txt).replace_parens
|
|
94
106
|
sentence_boundary_punctuation(txt)
|
|
95
107
|
end
|
|
96
108
|
|
|
@@ -106,7 +118,7 @@ module PragmaticSegmenter
|
|
|
106
118
|
@punct_arr ||= PragmaticSegmenter::Punctuation.new.punct
|
|
107
119
|
end
|
|
108
120
|
|
|
109
|
-
def
|
|
121
|
+
def between_punctuation(txt)
|
|
110
122
|
PragmaticSegmenter::BetweenPunctuation.new(text: txt).replace
|
|
111
123
|
end
|
|
112
124
|
|
|
@@ -28,7 +28,7 @@ module PragmaticSegmenter
|
|
|
28
28
|
All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
|
|
29
29
|
end
|
|
30
30
|
|
|
31
|
-
module
|
|
31
|
+
module DoublePunctuationRules
|
|
32
32
|
FirstRule = Rule.new(/\?!/, '☉')
|
|
33
33
|
SecondRule = Rule.new(/!\?/, '☈')
|
|
34
34
|
ThirdRule = Rule.new(/\?\?/, '☇')
|
|
@@ -64,6 +64,7 @@ module PragmaticSegmenter
|
|
|
64
64
|
MixedDoubleEQ = Rule.new(/☈/, '!?')
|
|
65
65
|
MixedDoubleEE = Rule.new(/☄/, '!!')
|
|
66
66
|
LeftParens = Rule.new(/&✂&/, '(')
|
|
67
|
+
RightParens = Rule.new(/&⌬&/, ')')
|
|
67
68
|
TemporaryEndingPunctutation = Rule.new('ȸ', '')
|
|
68
69
|
Newline = Rule.new(/ȹ/, "\n")
|
|
69
70
|
|
|
@@ -74,7 +75,7 @@ module PragmaticSegmenter
|
|
|
74
75
|
FullWidthQuestionMark, MixedDoubleQE,
|
|
75
76
|
MixedDoubleQQ, MixedDoubleEQ,
|
|
76
77
|
MixedDoubleEE, LeftParens,
|
|
77
|
-
TemporaryEndingPunctutation,
|
|
78
|
+
RightParens, TemporaryEndingPunctutation,
|
|
78
79
|
Newline ]
|
|
79
80
|
end
|
|
80
81
|
|
|
@@ -1515,6 +1515,36 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
|
1515
1515
|
ps = PragmaticSegmenter::Segmenter.new(text: "Hello. 'This is a test of single quotes.' A new sentence.")
|
|
1516
1516
|
expect(ps.segment).to eq(["Hello.", "'This is a test of single quotes.'", "A new sentence."])
|
|
1517
1517
|
end
|
|
1518
|
+
|
|
1519
|
+
it "correctly segments text #099" do
|
|
1520
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "[A sentence in square brackets.]")
|
|
1521
|
+
expect(ps.segment).to eq(["[A sentence in square brackets.]"])
|
|
1522
|
+
end
|
|
1523
|
+
|
|
1524
|
+
it "correctly segments text #100" do
|
|
1525
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "(iii) List item number 3.")
|
|
1526
|
+
expect(ps.segment).to eq(["(iii) List item number 3."])
|
|
1527
|
+
end
|
|
1528
|
+
|
|
1529
|
+
it "correctly segments text #101" do
|
|
1530
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Unbelievable??!?!")
|
|
1531
|
+
expect(ps.segment).to eq(["Unbelievable??!?!"])
|
|
1532
|
+
end
|
|
1533
|
+
|
|
1534
|
+
it "correctly segments text #102" do
|
|
1535
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "This abbreviation f.e. means for example.")
|
|
1536
|
+
expect(ps.segment).to eq(["This abbreviation f.e. means for example."])
|
|
1537
|
+
end
|
|
1538
|
+
|
|
1539
|
+
it "correctly segments text #103" do
|
|
1540
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "The med. staff here is very kind.")
|
|
1541
|
+
expect(ps.segment).to eq(["The med. staff here is very kind."])
|
|
1542
|
+
end
|
|
1543
|
+
|
|
1544
|
+
it "correctly segments text #104" do
|
|
1545
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "What did you order btw., she wondered.")
|
|
1546
|
+
expect(ps.segment).to eq(["What did you order btw., she wondered."])
|
|
1547
|
+
end
|
|
1518
1548
|
end
|
|
1519
1549
|
end
|
|
1520
1550
|
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: pragmatic_segmenter
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Kevin S. Dias
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2015-
|
|
11
|
+
date: 2015-02-04 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|