pragmatic_segmenter 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +8 -2
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +16 -51
- data/lib/pragmatic_segmenter/cleaner.rb +18 -99
- data/lib/pragmatic_segmenter/languages.rb +62 -0
- data/lib/pragmatic_segmenter/languages/amharic.rb +4 -30
- data/lib/pragmatic_segmenter/languages/arabic.rb +21 -64
- data/lib/pragmatic_segmenter/languages/armenian.rb +4 -30
- data/lib/pragmatic_segmenter/languages/burmese.rb +4 -30
- data/lib/pragmatic_segmenter/languages/chinese.rb +8 -0
- data/lib/pragmatic_segmenter/languages/common.rb +70 -1
- data/lib/pragmatic_segmenter/languages/deutsch.rb +49 -78
- data/lib/pragmatic_segmenter/languages/dutch.rb +5 -36
- data/lib/pragmatic_segmenter/languages/english.rb +3 -12
- data/lib/pragmatic_segmenter/languages/french.rb +5 -32
- data/lib/pragmatic_segmenter/languages/greek.rb +4 -26
- data/lib/pragmatic_segmenter/languages/hindi.rb +4 -30
- data/lib/pragmatic_segmenter/languages/italian.rb +3 -37
- data/lib/pragmatic_segmenter/languages/japanese.rb +6 -4
- data/lib/pragmatic_segmenter/languages/persian.rb +16 -40
- data/lib/pragmatic_segmenter/languages/polish.rb +6 -38
- data/lib/pragmatic_segmenter/languages/russian.rb +13 -33
- data/lib/pragmatic_segmenter/languages/spanish.rb +6 -31
- data/lib/pragmatic_segmenter/languages/urdu.rb +4 -30
- data/lib/pragmatic_segmenter/number.rb +5 -5
- data/lib/pragmatic_segmenter/process.rb +28 -49
- data/lib/pragmatic_segmenter/rules.rb +65 -1
- data/lib/pragmatic_segmenter/{ellipsis.rb → rules/ellipsis.rb} +0 -0
- data/lib/pragmatic_segmenter/rules/html.rb +13 -0
- data/lib/pragmatic_segmenter/segmenter.rb +12 -32
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +6 -7
- metadata +6 -8
- data/lib/pragmatic_segmenter/abbreviation.rb +0 -22
- data/lib/pragmatic_segmenter/language_support.rb +0 -31
- data/lib/pragmatic_segmenter/punctuation.rb +0 -12
- data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +0 -17
- data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +0 -37
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7e52d1869830dfba91d5e5a00f15d3529081691e
|
4
|
+
data.tar.gz: 5754723c0ba657a31e3471d785b034e3c1814e33
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d3a4700ba0369b60d36f633c4e042a5113197956ad02dff64243c70445bc445435af17689f456327753cead2d9d1cee00db548e2e0dad337341f110194522d2c
|
7
|
+
data.tar.gz: f8a28d6582e846ad2ffff21b87f4888b922cc4224faaf85e2782ca92671506dbf55fab86c0ddf06ae6363a3851f8ec9dd44f7bc9956a86f2ed2aea4bc6d69fb5
|
data/README.md
CHANGED
@@ -21,7 +21,7 @@ gem 'pragmatic_segmenter'
|
|
21
21
|
##Usage
|
22
22
|
|
23
23
|
* If no language is specified, the library will default to English.
|
24
|
-
* To specify a language use its two
|
24
|
+
* To specify a language use its two character [ISO 639-1 code](https://www.tm-town.com/languages).
|
25
25
|
|
26
26
|
```ruby
|
27
27
|
text = "Hello world. My name is Mr. Smith. I work for the U.S. Government and I live in the U.S. I live in New York."
|
@@ -60,7 +60,7 @@ Try out a [live demo](https://www.tm-town.com/natural-language-processing) of Pr
|
|
60
60
|
|
61
61
|
##Background
|
62
62
|
|
63
|
-
According to Wikipedia, [sentence boundary disambiguation](http://en.wikipedia.org/wiki/Sentence_boundary_disambiguation) (aka sentence boundary
|
63
|
+
According to Wikipedia, [sentence boundary disambiguation](http://en.wikipedia.org/wiki/Sentence_boundary_disambiguation) (aka sentence boundary detection, sentence segmentation) is defined as:
|
64
64
|
|
65
65
|
> Sentence boundary disambiguation (SBD), also known as sentence breaking, is the problem in natural language processing of deciding where sentences begin and end. Often natural language processing tools require their input to be divided into sentences for a number of reasons. However sentence boundary identification is challenging because punctuation marks are often ambiguous. For example, a period may denote an abbreviation, decimal point, an ellipsis, or an email address – not the end of a sentence. About 47% of the periods in the Wall Street Journal corpus denote abbreviations. As well, question marks and exclamation marks may appear in embedded quotations, emoticons, computer code, and slang. Languages like Japanese and Chinese have unambiguous sentence-ending markers.
|
66
66
|
|
@@ -677,6 +677,9 @@ Other tools not yet tested:
|
|
677
677
|
* [Ucto: Unicode Tokenizer](http://ilk.uvt.nl/ucto/)
|
678
678
|
* [tokenizer](http://moin.delph-in.net/WeSearch/DocumentParsing)
|
679
679
|
* [spaCy](http://honnibal.github.io/spaCy/)
|
680
|
+
* [GATE](https://gate.ac.uk/)
|
681
|
+
* [University of Illinois Sentence Segmentation tool](http://cogcomp.cs.illinois.edu/page/tools_view/2)
|
682
|
+
* [DetectorMorse](https://github.com/cslu-nlp/detectormorse)
|
680
683
|
|
681
684
|
## Speed Performance Benchmarks
|
682
685
|
|
@@ -810,6 +813,9 @@ To test the relative performance of different segmentation tools and libraries I
|
|
810
813
|
**Version 0.3.1**
|
811
814
|
* Fix undefined method 'gsub!' for nil:NilClass issue
|
812
815
|
|
816
|
+
**Version 0.3.2**
|
817
|
+
* Add English abbreviations
|
818
|
+
|
813
819
|
## Contributing
|
814
820
|
|
815
821
|
If you find a text that is incorrectly segmented using this gem, please submit an issue.
|
@@ -1,75 +1,51 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
-
require 'pragmatic_segmenter/abbreviation'
|
3
|
-
require 'pragmatic_segmenter/single_letter_abbreviation'
|
4
2
|
|
5
3
|
module PragmaticSegmenter
|
6
4
|
# This class searches for periods within an abbreviation and
|
7
5
|
# replaces the periods.
|
8
6
|
class AbbreviationReplacer
|
9
|
-
# Rubular: http://rubular.com/r/yqa4Rit8EY
|
10
|
-
PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
|
11
|
-
|
12
|
-
# Rubular: http://rubular.com/r/NEv265G2X2
|
13
|
-
KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '∯')
|
14
|
-
|
15
|
-
# Rubular: http://rubular.com/r/xDkpFZ0EgH
|
16
|
-
MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
|
17
|
-
|
18
|
-
module AmPmRules
|
19
|
-
# Rubular: http://rubular.com/r/Vnx3m4Spc8
|
20
|
-
UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.')
|
21
|
-
|
22
|
-
# Rubular: http://rubular.com/r/AJMCotJVbW
|
23
|
-
UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.')
|
24
|
-
|
25
|
-
# Rubular: http://rubular.com/r/13q7SnOhgA
|
26
|
-
LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.')
|
27
|
-
|
28
|
-
# Rubular: http://rubular.com/r/DgUDq4mLz5
|
29
|
-
LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.')
|
30
|
-
|
31
|
-
All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
|
32
|
-
end
|
33
7
|
|
34
8
|
SENTENCE_STARTERS = %w(A Being Did For He How However I In It Millions More She That The There They We What When Where Who Why)
|
35
9
|
|
36
10
|
attr_reader :text
|
37
|
-
def initialize(text:)
|
11
|
+
def initialize(text:, language: Languages::Common)
|
38
12
|
@text = Text.new(text)
|
13
|
+
@language = language
|
39
14
|
end
|
40
15
|
|
41
16
|
def replace
|
42
|
-
@reformatted_text = text.apply(PossessiveAbbreviationRule
|
43
|
-
|
44
|
-
|
45
|
-
|
17
|
+
@reformatted_text = text.apply(@language::PossessiveAbbreviationRule,
|
18
|
+
@language::KommanditgesellschaftRule,
|
19
|
+
@language::SingleLetterAbbreviationRules::All)
|
20
|
+
|
21
|
+
@reformatted_text = search_for_abbreviations_in_string(@reformatted_text)
|
46
22
|
@reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
|
47
|
-
@reformatted_text = @reformatted_text.apply(AmPmRules::All)
|
23
|
+
@reformatted_text = @reformatted_text.apply(@language::AmPmRules::All)
|
48
24
|
replace_abbreviation_as_sentence_boundary(@reformatted_text)
|
49
25
|
end
|
50
26
|
|
51
27
|
private
|
52
28
|
|
53
|
-
def search_for_abbreviations_in_string(txt
|
29
|
+
def search_for_abbreviations_in_string(txt)
|
54
30
|
original = txt.dup
|
55
31
|
downcased = txt.downcase
|
56
|
-
|
32
|
+
@language::Abbreviation::ABBREVIATIONS.each do |a|
|
57
33
|
next unless downcased.include?(a.strip)
|
58
34
|
abbrev_match = original.scan(/(?:^|\s|\r|\n)#{Regexp.escape(a.strip)}/i)
|
59
35
|
next if abbrev_match.empty?
|
60
36
|
next_word_start = /(?<=#{Regexp.escape(a.strip)} ).{1}/
|
61
37
|
character_array = @text.scan(next_word_start)
|
62
38
|
abbrev_match.each_with_index do |am, index|
|
63
|
-
txt = scan_for_replacements(txt, am, index, character_array
|
39
|
+
txt = scan_for_replacements(txt, am, index, character_array)
|
64
40
|
end
|
65
41
|
end
|
66
42
|
txt
|
67
43
|
end
|
68
44
|
|
69
|
-
def scan_for_replacements(txt, am, index, character_array
|
45
|
+
def scan_for_replacements(txt, am, index, character_array)
|
70
46
|
character = character_array[index]
|
71
|
-
prepositive =
|
72
|
-
number_abbr =
|
47
|
+
prepositive = @language::Abbreviation::PREPOSITIVE_ABBREVIATIONS
|
48
|
+
number_abbr = @language::Abbreviation::NUMBER_ABBREVIATIONS
|
73
49
|
upper = /[[:upper:]]/.match(character.to_s)
|
74
50
|
if upper.nil? || prepositive.include?(am.downcase.strip)
|
75
51
|
if prepositive.include?(am.downcase.strip)
|
@@ -83,10 +59,6 @@ module PragmaticSegmenter
|
|
83
59
|
txt
|
84
60
|
end
|
85
61
|
|
86
|
-
def abbreviations
|
87
|
-
@abbr ||= PragmaticSegmenter::Abbreviation.new
|
88
|
-
end
|
89
|
-
|
90
62
|
def replace_abbreviation_as_sentence_boundary(txt)
|
91
63
|
# As we are being conservative and keeping ambiguous
|
92
64
|
# sentence boundaries as one sentence instead of
|
@@ -120,7 +92,7 @@ module PragmaticSegmenter
|
|
120
92
|
end
|
121
93
|
|
122
94
|
def replace_multi_period_abbreviations(txt)
|
123
|
-
mpa = txt.scan(MULTI_PERIOD_ABBREVIATION_REGEX)
|
95
|
+
mpa = txt.scan(@language::MULTI_PERIOD_ABBREVIATION_REGEX)
|
124
96
|
return txt if mpa.empty?
|
125
97
|
mpa.each do |r|
|
126
98
|
txt = txt.gsub(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
|
@@ -128,13 +100,6 @@ module PragmaticSegmenter
|
|
128
100
|
txt
|
129
101
|
end
|
130
102
|
|
131
|
-
def replace_period_in_am_pm(txt)
|
132
|
-
txt.gsub(UPPERCASE_PM_REGEX, '.')
|
133
|
-
.gsub(UPPERCASE_AM_REGEX, '.')
|
134
|
-
.gsub(LOWERCASE_PM_REGEX, '.')
|
135
|
-
.gsub(LOWERCASE_AM_REGEX, '.')
|
136
|
-
end
|
137
|
-
|
138
103
|
def replace_pre_number_abbr(txt, abbr)
|
139
104
|
txt.gsub(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
|
140
105
|
.gsub(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
|
@@ -152,7 +117,7 @@ module PragmaticSegmenter
|
|
152
117
|
end
|
153
118
|
|
154
119
|
def replace_possessive_abbreviations(txt)
|
155
|
-
txt.gsub(POSSESSIVE_ABBREVIATION_REGEX, '∯')
|
120
|
+
txt.gsub(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
|
156
121
|
end
|
157
122
|
end
|
158
123
|
end
|
@@ -1,87 +1,16 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
2
|
|
3
3
|
module PragmaticSegmenter
|
4
|
-
module Rules
|
5
|
-
module HtmlRules
|
6
|
-
# Rubular: http://rubular.com/r/ENrVFMdJ8v
|
7
|
-
HTMLTagRule = Rule.new(/<\/?[^>]*>/, '')
|
8
|
-
|
9
|
-
# Rubular: http://rubular.com/r/XZVqMPJhea
|
10
|
-
EscapedHTMLTagRule = Rule.new(/<\/?[^gt;]*gt;/, '')
|
11
|
-
|
12
|
-
All = [HTMLTagRule, EscapedHTMLTagRule]
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
4
|
# This is an opinionated class that removes errant newlines,
|
17
5
|
# xhtml, inline formatting, etc.
|
18
6
|
class Cleaner
|
19
7
|
include Rules
|
20
|
-
URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']
|
21
|
-
|
22
|
-
# Rubular: http://rubular.com/r/6dt98uI76u
|
23
|
-
NO_SPACE_BETWEEN_SENTENCES_REGEX = /(?<=[a-z])\.(?=[A-Z])/
|
24
|
-
|
25
|
-
# Rubular: http://rubular.com/r/l6KN6rH5XE
|
26
|
-
NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = /(?<=\d)\.(?=[A-Z])/
|
27
|
-
|
28
|
-
# Rubular: http://rubular.com/r/V57WnM9Zut
|
29
|
-
NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')
|
30
|
-
|
31
|
-
# Rubular: http://rubular.com/r/3GiRiP2IbD
|
32
|
-
NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = /(?<=\s)\n(?=([a-z]|\())/
|
33
|
-
|
34
|
-
# Rubular: http://rubular.com/r/UZAVcwqck8
|
35
|
-
PDF_NewLineInMiddleOfSentenceRule = Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')
|
36
|
-
|
37
|
-
# Rubular: http://rubular.com/r/eaNwGavmdo
|
38
|
-
PDF_NewLineInMiddleOfSentenceNoSpacesRule = Rule.new(/\n(?=[a-z])/, ' ')
|
39
|
-
|
40
|
-
# Rubular: http://rubular.com/r/bAJrhyLNeZ
|
41
|
-
InlineFormattingRule = Rule.new(/\{b\^>\d*<b\^\}|\{b\^>\d*<b\^\}/, '')
|
42
|
-
|
43
|
-
# Rubular: http://rubular.com/r/dMxp5MixFS
|
44
|
-
DoubleNewLineWithSpaceRule = Rule.new(/\n \n/, "\r")
|
45
|
-
|
46
|
-
# Rubular: http://rubular.com/r/H6HOJeA8bq
|
47
|
-
DoubleNewLineRule = Rule.new(/\n\n/, "\r")
|
48
|
-
|
49
|
-
# Rubular: http://rubular.com/r/Gn18aAnLdZ
|
50
|
-
NewLineFollowedByBulletRule = Rule.new(/\n(?=•)/, "\r")
|
51
|
-
|
52
|
-
# Rubular: http://rubular.com/r/FseyMiiYFT
|
53
|
-
NewLineFollowedByPeriodRule = Rule.new(/\n(?=\.(\s|\n))/, '')
|
54
|
-
|
55
|
-
# Rubular: http://rubular.com/r/8mc1ArOIGy
|
56
|
-
TableOfContentsRule = Rule.new(/\.{5,}\s*\d+-*\d*/, "\r")
|
57
|
-
|
58
|
-
# Rubular: http://rubular.com/r/DwNSuZrNtk
|
59
|
-
ConsecutivePeriodsRule = Rule.new(/\.{5,}/, ' ')
|
60
|
-
|
61
|
-
# Rubular: http://rubular.com/r/IQ4TPfsbd8
|
62
|
-
ConsecutiveForwardSlashRule = Rule.new(/\/{3}/, '')
|
63
|
-
|
64
|
-
# Rubular: http://rubular.com/r/6dt98uI76u
|
65
|
-
NoSpaceBetweenSentencesRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
|
66
|
-
|
67
|
-
# Rubular: http://rubular.com/r/l6KN6rH5XE
|
68
|
-
NoSpaceBetweenSentencesDigitRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')
|
69
|
-
|
70
|
-
EscapedCarriageReturnRule = Rule.new(/\\r/, "\r")
|
71
|
-
TypoEscapedCarriageReturnRule = Rule.new(/\\\ r/, "\r")
|
72
|
-
|
73
|
-
EscapedNewLineRule = Rule.new(/\\n/, "\n")
|
74
|
-
TypoEscapedNewLineRule = Rule.new(/\\\ n/, "\n")
|
75
|
-
|
76
|
-
ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")
|
77
|
-
|
78
|
-
QuotationsFirstRule = Rule.new(/''/, '"')
|
79
|
-
QuotationsSecondRule = Rule.new(/``/, '"')
|
80
8
|
|
81
9
|
attr_reader :text, :doc_type
|
82
|
-
def initialize(text:, **args)
|
10
|
+
def initialize(text:, doc_type: nil, language: Languages::Common, **args)
|
83
11
|
@text = Text.new(text.dup)
|
84
|
-
@doc_type =
|
12
|
+
@doc_type = doc_type
|
13
|
+
@language = language
|
85
14
|
end
|
86
15
|
|
87
16
|
# Clean text of unwanted formatting
|
@@ -94,7 +23,7 @@ module PragmaticSegmenter
|
|
94
23
|
# Arguments:
|
95
24
|
# text: (String) *required
|
96
25
|
# language: (String) *optional
|
97
|
-
# (two
|
26
|
+
# (two character ISO 639-1 code e.g. 'en')
|
98
27
|
# doc_type: (String) *optional
|
99
28
|
# (e.g. 'pdf')
|
100
29
|
|
@@ -104,7 +33,7 @@ module PragmaticSegmenter
|
|
104
33
|
replace_double_newlines(@clean_text)
|
105
34
|
replace_newlines(@clean_text)
|
106
35
|
replace_escaped_newlines(@clean_text)
|
107
|
-
@clean_text.apply(
|
36
|
+
@clean_text.apply(HTMLRules::All)
|
108
37
|
replace_punctuation_in_brackets(@clean_text)
|
109
38
|
@clean_text.apply(InlineFormattingRule)
|
110
39
|
clean_quotations(@clean_text)
|
@@ -141,10 +70,6 @@ module PragmaticSegmenter
|
|
141
70
|
end
|
142
71
|
end
|
143
72
|
|
144
|
-
def abbreviations
|
145
|
-
@abbr ||= PragmaticSegmenter::Abbreviation.new.all
|
146
|
-
end
|
147
|
-
|
148
73
|
def remove_all_newlines(txt)
|
149
74
|
clean_text = remove_newline_in_middle_of_sentence(txt)
|
150
75
|
remove_newline_in_middle_of_word(clean_text)
|
@@ -161,50 +86,44 @@ module PragmaticSegmenter
|
|
161
86
|
end
|
162
87
|
|
163
88
|
def remove_newline_in_middle_of_word(txt)
|
164
|
-
txt.apply
|
89
|
+
txt.apply NewLineInMiddleOfWordRule
|
165
90
|
end
|
166
91
|
|
167
92
|
def replace_escaped_newlines(txt)
|
168
|
-
txt.apply
|
169
|
-
|
170
|
-
apply(TypoEscapedNewLineRule).
|
171
|
-
apply(TypoEscapedCarriageReturnRule)
|
93
|
+
txt.apply EscapedNewLineRule, EscapedCarriageReturnRule,
|
94
|
+
TypoEscapedNewLineRule, TypoEscapedCarriageReturnRule
|
172
95
|
end
|
173
96
|
|
174
97
|
def replace_double_newlines(txt)
|
175
|
-
txt.apply
|
176
|
-
apply(DoubleNewLineRule)
|
98
|
+
txt.apply DoubleNewLineWithSpaceRule, DoubleNewLineRule
|
177
99
|
end
|
178
100
|
|
179
101
|
def replace_newlines(txt)
|
180
102
|
if doc_type.eql?('pdf')
|
181
103
|
remove_pdf_line_breaks(txt)
|
182
104
|
else
|
183
|
-
txt.apply
|
184
|
-
|
105
|
+
txt.apply NewLineFollowedByPeriodRule,
|
106
|
+
ReplaceNewlineWithCarriageReturnRule
|
185
107
|
end
|
186
108
|
end
|
187
109
|
|
188
110
|
def remove_pdf_line_breaks(txt)
|
189
|
-
txt.apply
|
190
|
-
|
191
|
-
|
111
|
+
txt.apply NewLineFollowedByBulletRule,
|
112
|
+
PDF_NewLineInMiddleOfSentenceRule,
|
113
|
+
PDF_NewLineInMiddleOfSentenceNoSpacesRule
|
192
114
|
end
|
193
115
|
|
194
116
|
def clean_quotations(txt)
|
195
|
-
txt.apply
|
196
|
-
apply(QuotationsSecondRule)
|
117
|
+
txt.apply QuotationsFirstRule, QuotationsSecondRule
|
197
118
|
end
|
198
119
|
|
199
120
|
def clean_table_of_contents(txt)
|
200
|
-
txt.apply
|
201
|
-
|
202
|
-
apply(ConsecutiveForwardSlashRule)
|
121
|
+
txt.apply TableOfContentsRule, ConsecutivePeriodsRule,
|
122
|
+
ConsecutiveForwardSlashRule
|
203
123
|
end
|
204
124
|
|
205
125
|
def clean_consecutive_characters(txt)
|
206
|
-
txt.apply
|
207
|
-
apply(ConsecutiveForwardSlashRule)
|
126
|
+
txt.apply ConsecutivePeriodsRule, ConsecutiveForwardSlashRule
|
208
127
|
end
|
209
128
|
end
|
210
129
|
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'pragmatic_segmenter/types'
|
2
|
+
require 'pragmatic_segmenter/process'
|
3
|
+
require 'pragmatic_segmenter/cleaner'
|
4
|
+
require 'pragmatic_segmenter/rules'
|
5
|
+
|
6
|
+
require 'pragmatic_segmenter/languages/common'
|
7
|
+
|
8
|
+
require 'pragmatic_segmenter/languages/english'
|
9
|
+
require 'pragmatic_segmenter/languages/deutsch'
|
10
|
+
require 'pragmatic_segmenter/languages/hindi'
|
11
|
+
require 'pragmatic_segmenter/languages/persian'
|
12
|
+
require 'pragmatic_segmenter/languages/amharic'
|
13
|
+
require 'pragmatic_segmenter/languages/arabic'
|
14
|
+
require 'pragmatic_segmenter/languages/greek'
|
15
|
+
require 'pragmatic_segmenter/languages/armenian'
|
16
|
+
require 'pragmatic_segmenter/languages/burmese'
|
17
|
+
require 'pragmatic_segmenter/languages/urdu'
|
18
|
+
require 'pragmatic_segmenter/languages/french'
|
19
|
+
require 'pragmatic_segmenter/languages/italian'
|
20
|
+
require 'pragmatic_segmenter/languages/spanish'
|
21
|
+
require 'pragmatic_segmenter/languages/russian'
|
22
|
+
require 'pragmatic_segmenter/languages/japanese'
|
23
|
+
require 'pragmatic_segmenter/languages/dutch'
|
24
|
+
require 'pragmatic_segmenter/languages/polish'
|
25
|
+
require 'pragmatic_segmenter/languages/chinese'
|
26
|
+
|
27
|
+
module PragmaticSegmenter
|
28
|
+
module Languages
|
29
|
+
LANGUAGE_CODES = {
|
30
|
+
'en' => 'English',
|
31
|
+
'de' => 'Deutsch',
|
32
|
+
'es' => 'Spanish',
|
33
|
+
'fr' => 'French',
|
34
|
+
'it' => 'Italian',
|
35
|
+
'ja' => 'Japanese',
|
36
|
+
'el' => 'Greek',
|
37
|
+
'ru' => 'Russian',
|
38
|
+
'ar' => 'Arabic',
|
39
|
+
'am' => 'Amharic',
|
40
|
+
'hi' => 'Hindi',
|
41
|
+
'hy' => 'Armenian',
|
42
|
+
'fa' => 'Persian',
|
43
|
+
'my' => 'Burmese',
|
44
|
+
'ur' => 'Urdu',
|
45
|
+
'nl' => 'Dutch',
|
46
|
+
'pl' => 'Polish',
|
47
|
+
'zh' => 'Chinese',
|
48
|
+
}
|
49
|
+
|
50
|
+
def process_class
|
51
|
+
language_module::Process
|
52
|
+
end
|
53
|
+
|
54
|
+
def cleaner_class
|
55
|
+
language_module::Cleaner
|
56
|
+
end
|
57
|
+
|
58
|
+
def language_module
|
59
|
+
Object.const_get("PragmaticSegmenter::Languages::#{LANGUAGE_CODES[language] || 'Common'}")
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
@@ -1,36 +1,10 @@
|
|
1
1
|
module PragmaticSegmenter
|
2
2
|
module Languages
|
3
|
-
|
4
|
-
|
5
|
-
private
|
3
|
+
module Amharic
|
4
|
+
include Languages::Common
|
6
5
|
|
7
|
-
|
8
|
-
|
9
|
-
end
|
10
|
-
|
11
|
-
def punctuation_array
|
12
|
-
PragmaticSegmenter::Languages::Amharic::Punctuation.new.punct
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
class Cleaner < PragmaticSegmenter::Cleaner
|
17
|
-
end
|
18
|
-
|
19
|
-
class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
|
20
|
-
SENTENCE_BOUNDARY = /.*?[፧።!\?]|.*?$/
|
21
|
-
|
22
|
-
def split
|
23
|
-
text.scan(SENTENCE_BOUNDARY)
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
class Punctuation < PragmaticSegmenter::Punctuation
|
28
|
-
PUNCT = ['።', '፧', '?', '!']
|
29
|
-
|
30
|
-
def punct
|
31
|
-
PUNCT
|
32
|
-
end
|
33
|
-
end
|
6
|
+
SENTENCE_BOUNDARY_REGEX = /.*?[፧።!\?]|.*?$/
|
7
|
+
Punctuations = ['።', '፧', '?', '!']
|
34
8
|
end
|
35
9
|
end
|
36
10
|
end
|