pragmatic_segmenter 0.3.3 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +6 -6
- data/lib/pragmatic_segmenter/between_punctuation.rb +6 -4
- data/lib/pragmatic_segmenter/cleaner.rb +51 -47
- data/lib/pragmatic_segmenter/cleaner/rules.rb +86 -0
- data/lib/pragmatic_segmenter/languages.rb +21 -30
- data/lib/pragmatic_segmenter/languages/arabic.rb +0 -13
- data/lib/pragmatic_segmenter/languages/common.rb +67 -44
- data/lib/pragmatic_segmenter/languages/common/ellipsis.rb +37 -0
- data/lib/pragmatic_segmenter/languages/common/numbers.rb +90 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +25 -48
- data/lib/pragmatic_segmenter/languages/english.rb +3 -3
- data/lib/pragmatic_segmenter/languages/japanese.rb +5 -13
- data/lib/pragmatic_segmenter/languages/persian.rb +0 -14
- data/lib/pragmatic_segmenter/languages/russian.rb +0 -25
- data/lib/pragmatic_segmenter/languages/spanish.rb +0 -9
- data/lib/pragmatic_segmenter/list.rb +60 -58
- data/lib/pragmatic_segmenter/{process.rb → processor.rb} +47 -26
- data/lib/pragmatic_segmenter/punctuation_replacer.rb +41 -20
- data/lib/pragmatic_segmenter/segmenter.rb +19 -5
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/pragmatic_segmenter.gemspec +1 -0
- data/spec/pragmatic_segmenter/languages/amharic_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/arabic_spec.rb +59 -0
- data/spec/pragmatic_segmenter/languages/armenian_spec.rb +160 -0
- data/spec/pragmatic_segmenter/languages/burmese_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/chinese_spec.rb +11 -0
- data/spec/pragmatic_segmenter/languages/deutsch_spec.rb +189 -0
- data/spec/pragmatic_segmenter/languages/dutch_spec.rb +23 -0
- data/spec/pragmatic_segmenter/languages/english_spec.rb +1348 -0
- data/spec/pragmatic_segmenter/languages/french_spec.rb +31 -0
- data/spec/pragmatic_segmenter/languages/greek_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/hindi_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/italian_spec.rb +190 -0
- data/spec/pragmatic_segmenter/languages/japanese_spec.rb +53 -0
- data/spec/pragmatic_segmenter/languages/persian_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/polish_spec.rb +11 -0
- data/spec/pragmatic_segmenter/languages/russian_spec.rb +219 -0
- data/spec/pragmatic_segmenter/languages/spanish_spec.rb +189 -0
- data/spec/pragmatic_segmenter/languages/urdu_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages_spec.rb +31 -0
- data/spec/pragmatic_segmenter_spec.rb +24 -2583
- metadata +59 -8
- data/lib/pragmatic_segmenter/number.rb +0 -35
- data/lib/pragmatic_segmenter/rules.rb +0 -168
- data/lib/pragmatic_segmenter/rules/ellipsis.rb +0 -35
- data/lib/pragmatic_segmenter/rules/html.rb +0 -13
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.3
|
4
|
+
version: 0.3.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-12-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: guard-rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
description: 'Pragmatic Segmenter is a sentence segmentation tool for Ruby. It allows
|
56
70
|
you to split a text into an array of sentences. This gem provides 2 main benefits
|
57
71
|
over other segmentation gems - 1) It works well even with ill-formatted text 2)
|
@@ -73,6 +87,7 @@ files:
|
|
73
87
|
- lib/pragmatic_segmenter/abbreviation_replacer.rb
|
74
88
|
- lib/pragmatic_segmenter/between_punctuation.rb
|
75
89
|
- lib/pragmatic_segmenter/cleaner.rb
|
90
|
+
- lib/pragmatic_segmenter/cleaner/rules.rb
|
76
91
|
- lib/pragmatic_segmenter/exclamation_words.rb
|
77
92
|
- lib/pragmatic_segmenter/languages.rb
|
78
93
|
- lib/pragmatic_segmenter/languages/amharic.rb
|
@@ -81,6 +96,8 @@ files:
|
|
81
96
|
- lib/pragmatic_segmenter/languages/burmese.rb
|
82
97
|
- lib/pragmatic_segmenter/languages/chinese.rb
|
83
98
|
- lib/pragmatic_segmenter/languages/common.rb
|
99
|
+
- lib/pragmatic_segmenter/languages/common/ellipsis.rb
|
100
|
+
- lib/pragmatic_segmenter/languages/common/numbers.rb
|
84
101
|
- lib/pragmatic_segmenter/languages/deutsch.rb
|
85
102
|
- lib/pragmatic_segmenter/languages/dutch.rb
|
86
103
|
- lib/pragmatic_segmenter/languages/english.rb
|
@@ -95,17 +112,32 @@ files:
|
|
95
112
|
- lib/pragmatic_segmenter/languages/spanish.rb
|
96
113
|
- lib/pragmatic_segmenter/languages/urdu.rb
|
97
114
|
- lib/pragmatic_segmenter/list.rb
|
98
|
-
- lib/pragmatic_segmenter/number.rb
|
99
|
-
- lib/pragmatic_segmenter/process.rb
|
115
|
+
- lib/pragmatic_segmenter/processor.rb
|
100
116
|
- lib/pragmatic_segmenter/punctuation_replacer.rb
|
101
|
-
- lib/pragmatic_segmenter/rules.rb
|
102
|
-
- lib/pragmatic_segmenter/rules/ellipsis.rb
|
103
|
-
- lib/pragmatic_segmenter/rules/html.rb
|
104
117
|
- lib/pragmatic_segmenter/segmenter.rb
|
105
118
|
- lib/pragmatic_segmenter/types.rb
|
106
119
|
- lib/pragmatic_segmenter/version.rb
|
107
120
|
- pragmatic_segmenter.gemspec
|
108
121
|
- spec/performance_spec.rb
|
122
|
+
- spec/pragmatic_segmenter/languages/amharic_spec.rb
|
123
|
+
- spec/pragmatic_segmenter/languages/arabic_spec.rb
|
124
|
+
- spec/pragmatic_segmenter/languages/armenian_spec.rb
|
125
|
+
- spec/pragmatic_segmenter/languages/burmese_spec.rb
|
126
|
+
- spec/pragmatic_segmenter/languages/chinese_spec.rb
|
127
|
+
- spec/pragmatic_segmenter/languages/deutsch_spec.rb
|
128
|
+
- spec/pragmatic_segmenter/languages/dutch_spec.rb
|
129
|
+
- spec/pragmatic_segmenter/languages/english_spec.rb
|
130
|
+
- spec/pragmatic_segmenter/languages/french_spec.rb
|
131
|
+
- spec/pragmatic_segmenter/languages/greek_spec.rb
|
132
|
+
- spec/pragmatic_segmenter/languages/hindi_spec.rb
|
133
|
+
- spec/pragmatic_segmenter/languages/italian_spec.rb
|
134
|
+
- spec/pragmatic_segmenter/languages/japanese_spec.rb
|
135
|
+
- spec/pragmatic_segmenter/languages/persian_spec.rb
|
136
|
+
- spec/pragmatic_segmenter/languages/polish_spec.rb
|
137
|
+
- spec/pragmatic_segmenter/languages/russian_spec.rb
|
138
|
+
- spec/pragmatic_segmenter/languages/spanish_spec.rb
|
139
|
+
- spec/pragmatic_segmenter/languages/urdu_spec.rb
|
140
|
+
- spec/pragmatic_segmenter/languages_spec.rb
|
109
141
|
- spec/pragmatic_segmenter_spec.rb
|
110
142
|
- spec/spec_helper.rb
|
111
143
|
homepage: https://github.com/diasks2/pragmatic_segmenter
|
@@ -128,12 +160,31 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
128
160
|
version: '0'
|
129
161
|
requirements: []
|
130
162
|
rubyforge_project:
|
131
|
-
rubygems_version: 2.4.
|
163
|
+
rubygems_version: 2.4.8
|
132
164
|
signing_key:
|
133
165
|
specification_version: 4
|
134
166
|
summary: A rule-based sentence boundary detection gem that works out-of-the-box across
|
135
167
|
many languages
|
136
168
|
test_files:
|
137
169
|
- spec/performance_spec.rb
|
170
|
+
- spec/pragmatic_segmenter/languages/amharic_spec.rb
|
171
|
+
- spec/pragmatic_segmenter/languages/arabic_spec.rb
|
172
|
+
- spec/pragmatic_segmenter/languages/armenian_spec.rb
|
173
|
+
- spec/pragmatic_segmenter/languages/burmese_spec.rb
|
174
|
+
- spec/pragmatic_segmenter/languages/chinese_spec.rb
|
175
|
+
- spec/pragmatic_segmenter/languages/deutsch_spec.rb
|
176
|
+
- spec/pragmatic_segmenter/languages/dutch_spec.rb
|
177
|
+
- spec/pragmatic_segmenter/languages/english_spec.rb
|
178
|
+
- spec/pragmatic_segmenter/languages/french_spec.rb
|
179
|
+
- spec/pragmatic_segmenter/languages/greek_spec.rb
|
180
|
+
- spec/pragmatic_segmenter/languages/hindi_spec.rb
|
181
|
+
- spec/pragmatic_segmenter/languages/italian_spec.rb
|
182
|
+
- spec/pragmatic_segmenter/languages/japanese_spec.rb
|
183
|
+
- spec/pragmatic_segmenter/languages/persian_spec.rb
|
184
|
+
- spec/pragmatic_segmenter/languages/polish_spec.rb
|
185
|
+
- spec/pragmatic_segmenter/languages/russian_spec.rb
|
186
|
+
- spec/pragmatic_segmenter/languages/spanish_spec.rb
|
187
|
+
- spec/pragmatic_segmenter/languages/urdu_spec.rb
|
188
|
+
- spec/pragmatic_segmenter/languages_spec.rb
|
138
189
|
- spec/pragmatic_segmenter_spec.rb
|
139
190
|
- spec/spec_helper.rb
|
@@ -1,35 +0,0 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
|
-
|
3
|
-
module PragmaticSegmenter
|
4
|
-
# This class searches for numbers with periods within a string and
|
5
|
-
# replaces the periods.
|
6
|
-
class Number
|
7
|
-
# Rubular: http://rubular.com/r/oNyxBOqbyy
|
8
|
-
PeriodBeforeNumberRule = Rule.new(/\.(?=\d)/, '∯')
|
9
|
-
|
10
|
-
# Rubular: http://rubular.com/r/EMk5MpiUzt
|
11
|
-
NumberAfterPeriodBeforeLetterRule = Rule.new(/(?<=\d)\.(?=\S)/, '∯')
|
12
|
-
|
13
|
-
# Rubular: http://rubular.com/r/rf4l1HjtjG
|
14
|
-
NewLineNumberPeriodSpaceLetterRule = Rule.new(/(?<=\r\d)\.(?=(\s\S)|\))/, '∯')
|
15
|
-
|
16
|
-
# Rubular: http://rubular.com/r/HPa4sdc6b9
|
17
|
-
StartLineNumberPeriodRule = Rule.new(/(?<=^\d)\.(?=(\s\S)|\))/, '∯')
|
18
|
-
|
19
|
-
# Rubular: http://rubular.com/r/NuvWnKleFl
|
20
|
-
StartLineTwoDigitNumberPeriodRule = Rule.new(/(?<=^\d\d)\.(?=(\s\S)|\))/, '∯')
|
21
|
-
|
22
|
-
attr_reader :text
|
23
|
-
def initialize(text:)
|
24
|
-
@text = Text.new(text)
|
25
|
-
end
|
26
|
-
|
27
|
-
def replace
|
28
|
-
@text.apply PeriodBeforeNumberRule,
|
29
|
-
NumberAfterPeriodBeforeLetterRule,
|
30
|
-
NewLineNumberPeriodSpaceLetterRule,
|
31
|
-
StartLineNumberPeriodRule,
|
32
|
-
StartLineTwoDigitNumberPeriodRule
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
@@ -1,168 +0,0 @@
|
|
1
|
-
require 'pragmatic_segmenter/rules/html'
|
2
|
-
|
3
|
-
module PragmaticSegmenter
|
4
|
-
module Rules
|
5
|
-
|
6
|
-
URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']
|
7
|
-
|
8
|
-
# Rubular: http://rubular.com/r/6dt98uI76u
|
9
|
-
NO_SPACE_BETWEEN_SENTENCES_REGEX = /(?<=[a-z])\.(?=[A-Z])/
|
10
|
-
|
11
|
-
# Rubular: http://rubular.com/r/l6KN6rH5XE
|
12
|
-
NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = /(?<=\d)\.(?=[A-Z])/
|
13
|
-
|
14
|
-
# Rubular: http://rubular.com/r/V57WnM9Zut
|
15
|
-
NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')
|
16
|
-
|
17
|
-
# Rubular: http://rubular.com/r/3GiRiP2IbD
|
18
|
-
NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = /(?<=\s)\n(?=([a-z]|\())/
|
19
|
-
|
20
|
-
# Rubular: http://rubular.com/r/UZAVcwqck8
|
21
|
-
PDF_NewLineInMiddleOfSentenceRule = Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')
|
22
|
-
|
23
|
-
# Rubular: http://rubular.com/r/eaNwGavmdo
|
24
|
-
PDF_NewLineInMiddleOfSentenceNoSpacesRule = Rule.new(/\n(?=[a-z])/, ' ')
|
25
|
-
|
26
|
-
# Rubular: http://rubular.com/r/bAJrhyLNeZ
|
27
|
-
InlineFormattingRule = Rule.new(/\{b\^>\d*<b\^\}|\{b\^>\d*<b\^\}/, '')
|
28
|
-
|
29
|
-
# Rubular: http://rubular.com/r/dMxp5MixFS
|
30
|
-
DoubleNewLineWithSpaceRule = Rule.new(/\n \n/, "\r")
|
31
|
-
|
32
|
-
# Rubular: http://rubular.com/r/H6HOJeA8bq
|
33
|
-
DoubleNewLineRule = Rule.new(/\n\n/, "\r")
|
34
|
-
|
35
|
-
# Rubular: http://rubular.com/r/Gn18aAnLdZ
|
36
|
-
NewLineFollowedByBulletRule = Rule.new(/\n(?=•)/, "\r")
|
37
|
-
|
38
|
-
# Rubular: http://rubular.com/r/FseyMiiYFT
|
39
|
-
NewLineFollowedByPeriodRule = Rule.new(/\n(?=\.(\s|\n))/, '')
|
40
|
-
|
41
|
-
# Rubular: http://rubular.com/r/8mc1ArOIGy
|
42
|
-
TableOfContentsRule = Rule.new(/\.{5,}\s*\d+-*\d*/, "\r")
|
43
|
-
|
44
|
-
# Rubular: http://rubular.com/r/DwNSuZrNtk
|
45
|
-
ConsecutivePeriodsRule = Rule.new(/\.{5,}/, ' ')
|
46
|
-
|
47
|
-
# Rubular: http://rubular.com/r/IQ4TPfsbd8
|
48
|
-
ConsecutiveForwardSlashRule = Rule.new(/\/{3}/, '')
|
49
|
-
|
50
|
-
# Rubular: http://rubular.com/r/6dt98uI76u
|
51
|
-
NoSpaceBetweenSentencesRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
|
52
|
-
|
53
|
-
# Rubular: http://rubular.com/r/l6KN6rH5XE
|
54
|
-
NoSpaceBetweenSentencesDigitRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')
|
55
|
-
|
56
|
-
EscapedCarriageReturnRule = Rule.new(/\\r/, "\r")
|
57
|
-
TypoEscapedCarriageReturnRule = Rule.new(/\\\ r/, "\r")
|
58
|
-
|
59
|
-
EscapedNewLineRule = Rule.new(/\\n/, "\n")
|
60
|
-
TypoEscapedNewLineRule = Rule.new(/\\\ n/, "\n")
|
61
|
-
|
62
|
-
ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")
|
63
|
-
|
64
|
-
QuotationsFirstRule = Rule.new(/''/, '"')
|
65
|
-
QuotationsSecondRule = Rule.new(/``/, '"')
|
66
|
-
|
67
|
-
# Rubular: http://rubular.com/r/EUbZCNfgei
|
68
|
-
AbbreviationsWithMultiplePeriodsAndEmailRule = Rule.new(/(\w)(\.)(\w)/, '\1∮\3')
|
69
|
-
|
70
|
-
# Rubular: http://rubular.com/r/G2opjedIm9
|
71
|
-
GeoLocationRule = Rule.new(/(?<=[a-zA-z]°)\.(?=\s*\d+)/, '∯')
|
72
|
-
|
73
|
-
SingleNewLineRule = Rule.new(/\n/, 'ȹ')
|
74
|
-
|
75
|
-
SubSingleQuoteRule = Rule.new(/&⎋&/, "'")
|
76
|
-
|
77
|
-
ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ')
|
78
|
-
|
79
|
-
# Rubular: http://rubular.com/r/aXPUGm6fQh
|
80
|
-
QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '&ᓷ&')
|
81
|
-
|
82
|
-
module ExclamationPointRules
|
83
|
-
# Rubular: http://rubular.com/r/XS1XXFRfM2
|
84
|
-
InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '&ᓴ&')
|
85
|
-
|
86
|
-
# Rubular: http://rubular.com/r/sl57YI8LkA
|
87
|
-
BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '&ᓴ&')
|
88
|
-
|
89
|
-
# Rubular: http://rubular.com/r/f9zTjmkIPb
|
90
|
-
MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '&ᓴ&')
|
91
|
-
|
92
|
-
All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
|
93
|
-
end
|
94
|
-
|
95
|
-
module DoublePunctuationRules
|
96
|
-
FirstRule = Rule.new(/\?!/, '☉')
|
97
|
-
SecondRule = Rule.new(/!\?/, '☈')
|
98
|
-
ThirdRule = Rule.new(/\?\?/, '☇')
|
99
|
-
ForthRule = Rule.new(/!!/, '☄')
|
100
|
-
|
101
|
-
All = [ FirstRule, SecondRule, ThirdRule, ForthRule ]
|
102
|
-
end
|
103
|
-
|
104
|
-
module ReinsertEllipsisRules
|
105
|
-
SubThreeConsecutivePeriod = Rule.new(/ƪ/, '...')
|
106
|
-
SubThreeSpacePeriod = Rule.new(/♟/, ' . . . ')
|
107
|
-
SubFourSpacePeriod = Rule.new(/♝/, '. . . .')
|
108
|
-
SubTwoConsecutivePeriod = Rule.new(/☏/, '..')
|
109
|
-
SubOnePeriod = Rule.new(/∮/, '.')
|
110
|
-
|
111
|
-
All = [ SubThreeConsecutivePeriod, SubThreeSpacePeriod,
|
112
|
-
SubFourSpacePeriod, SubTwoConsecutivePeriod,
|
113
|
-
SubOnePeriod ]
|
114
|
-
end
|
115
|
-
|
116
|
-
module SubSymbolsRules
|
117
|
-
Period = Rule.new(/∯/, '.')
|
118
|
-
ArabicComma = Rule.new(/♬/, '،')
|
119
|
-
SemiColon = Rule.new(/♭/, ':')
|
120
|
-
FullWidthPeriod = Rule.new(/&ᓰ&/, '。')
|
121
|
-
SpecialPeriod = Rule.new(/&ᓱ&/, '.')
|
122
|
-
FullWidthExclamation = Rule.new(/&ᓳ&/, '!')
|
123
|
-
ExclamationPoint = Rule.new(/&ᓴ&/, '!')
|
124
|
-
QuestionMark = Rule.new(/&ᓷ&/, '?')
|
125
|
-
FullWidthQuestionMark = Rule.new(/&ᓸ&/, '?')
|
126
|
-
MixedDoubleQE = Rule.new(/☉/, '?!')
|
127
|
-
MixedDoubleQQ = Rule.new(/☇/, '??')
|
128
|
-
MixedDoubleEQ = Rule.new(/☈/, '!?')
|
129
|
-
MixedDoubleEE = Rule.new(/☄/, '!!')
|
130
|
-
LeftParens = Rule.new(/&✂&/, '(')
|
131
|
-
RightParens = Rule.new(/&⌬&/, ')')
|
132
|
-
TemporaryEndingPunctutation = Rule.new('ȸ', '')
|
133
|
-
Newline = Rule.new(/ȹ/, "\n")
|
134
|
-
|
135
|
-
All = [ Period, ArabicComma,
|
136
|
-
SemiColon, FullWidthPeriod,
|
137
|
-
SpecialPeriod, FullWidthExclamation,
|
138
|
-
ExclamationPoint, QuestionMark,
|
139
|
-
FullWidthQuestionMark, MixedDoubleQE,
|
140
|
-
MixedDoubleQQ, MixedDoubleEQ,
|
141
|
-
MixedDoubleEE, LeftParens,
|
142
|
-
RightParens, TemporaryEndingPunctutation,
|
143
|
-
Newline ]
|
144
|
-
end
|
145
|
-
|
146
|
-
module EscapeRegexReservedCharacters
|
147
|
-
LeftParen = Rule.new('(', '\\(')
|
148
|
-
RightParen = Rule.new(')', '\\)')
|
149
|
-
LeftBracket = Rule.new('[', '\\[')
|
150
|
-
RightBracket = Rule.new(']', '\\]')
|
151
|
-
Dash = Rule.new('-', '\\-')
|
152
|
-
|
153
|
-
All = [ LeftParen, RightParen,
|
154
|
-
LeftBracket, RightBracket, Dash ]
|
155
|
-
end
|
156
|
-
|
157
|
-
module SubEscapedRegexReservedCharacters
|
158
|
-
SubLeftParen = Rule.new('\\(', '(')
|
159
|
-
SubRightParen = Rule.new('\\)', ')')
|
160
|
-
SubLeftBracket = Rule.new('\\[', '[')
|
161
|
-
SubRightBracket = Rule.new('\\]', ']')
|
162
|
-
SubDash = Rule.new('\\-', '-')
|
163
|
-
|
164
|
-
All = [ SubLeftParen, SubRightParen,
|
165
|
-
SubLeftBracket, SubRightBracket, SubDash ]
|
166
|
-
end
|
167
|
-
end
|
168
|
-
end
|
@@ -1,35 +0,0 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
|
-
|
3
|
-
module PragmaticSegmenter
|
4
|
-
module Rules
|
5
|
-
# This class searches for ellipses within a string and
|
6
|
-
# replaces the periods.
|
7
|
-
|
8
|
-
# http://www.dailywritingtips.com/in-search-of-a-4-dot-ellipsis/
|
9
|
-
# http://www.thepunctuationguide.com/ellipses.html
|
10
|
-
|
11
|
-
module EllipsisRules
|
12
|
-
# Rubular: http://rubular.com/r/i60hCK81fz
|
13
|
-
ThreeConsecutiveRule = Rule.new(/\.\.\.(?=\s+[A-Z])/, '☏.')
|
14
|
-
|
15
|
-
# Rubular: http://rubular.com/r/Hdqpd90owl
|
16
|
-
FourConsecutiveRule = Rule.new(/(?<=\S)\.{3}(?=\.\s[A-Z])/, 'ƪ')
|
17
|
-
|
18
|
-
# Rubular: http://rubular.com/r/YBG1dIHTRu
|
19
|
-
ThreeSpaceRule = Rule.new(/(\s\.){3}\s/, '♟')
|
20
|
-
|
21
|
-
# Rubular: http://rubular.com/r/2VvZ8wRbd8
|
22
|
-
FourSpaceRule = Rule.new(/(?<=[a-z])(\.\s){3}\.(\z|$|\n)/, '♝')
|
23
|
-
|
24
|
-
OtherThreePeriodRule = Rule.new(/\.\.\./, 'ƪ')
|
25
|
-
|
26
|
-
All = [
|
27
|
-
ThreeSpaceRule,
|
28
|
-
FourSpaceRule,
|
29
|
-
FourConsecutiveRule,
|
30
|
-
ThreeConsecutiveRule,
|
31
|
-
OtherThreePeriodRule
|
32
|
-
]
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
@@ -1,13 +0,0 @@
|
|
1
|
-
module PragmaticSegmenter
|
2
|
-
module Rules
|
3
|
-
module HTMLRules
|
4
|
-
# Rubular: http://rubular.com/r/ENrVFMdJ8v
|
5
|
-
HTMLTagRule = Rule.new(/<\/?[^>]*>/, '')
|
6
|
-
|
7
|
-
# Rubular: http://rubular.com/r/XZVqMPJhea
|
8
|
-
EscapedHTMLTagRule = Rule.new(/<\/?[^gt;]*gt;/, '')
|
9
|
-
|
10
|
-
All = [HTMLTagRule, EscapedHTMLTagRule]
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|