pragmatic_segmenter 0.3.3 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +1 -0
  3. data/lib/pragmatic_segmenter/abbreviation_replacer.rb +6 -6
  4. data/lib/pragmatic_segmenter/between_punctuation.rb +6 -4
  5. data/lib/pragmatic_segmenter/cleaner.rb +51 -47
  6. data/lib/pragmatic_segmenter/cleaner/rules.rb +86 -0
  7. data/lib/pragmatic_segmenter/languages.rb +21 -30
  8. data/lib/pragmatic_segmenter/languages/arabic.rb +0 -13
  9. data/lib/pragmatic_segmenter/languages/common.rb +67 -44
  10. data/lib/pragmatic_segmenter/languages/common/ellipsis.rb +37 -0
  11. data/lib/pragmatic_segmenter/languages/common/numbers.rb +90 -0
  12. data/lib/pragmatic_segmenter/languages/deutsch.rb +25 -48
  13. data/lib/pragmatic_segmenter/languages/english.rb +3 -3
  14. data/lib/pragmatic_segmenter/languages/japanese.rb +5 -13
  15. data/lib/pragmatic_segmenter/languages/persian.rb +0 -14
  16. data/lib/pragmatic_segmenter/languages/russian.rb +0 -25
  17. data/lib/pragmatic_segmenter/languages/spanish.rb +0 -9
  18. data/lib/pragmatic_segmenter/list.rb +60 -58
  19. data/lib/pragmatic_segmenter/{process.rb → processor.rb} +47 -26
  20. data/lib/pragmatic_segmenter/punctuation_replacer.rb +41 -20
  21. data/lib/pragmatic_segmenter/segmenter.rb +19 -5
  22. data/lib/pragmatic_segmenter/version.rb +1 -1
  23. data/pragmatic_segmenter.gemspec +1 -0
  24. data/spec/pragmatic_segmenter/languages/amharic_spec.rb +18 -0
  25. data/spec/pragmatic_segmenter/languages/arabic_spec.rb +59 -0
  26. data/spec/pragmatic_segmenter/languages/armenian_spec.rb +160 -0
  27. data/spec/pragmatic_segmenter/languages/burmese_spec.rb +18 -0
  28. data/spec/pragmatic_segmenter/languages/chinese_spec.rb +11 -0
  29. data/spec/pragmatic_segmenter/languages/deutsch_spec.rb +189 -0
  30. data/spec/pragmatic_segmenter/languages/dutch_spec.rb +23 -0
  31. data/spec/pragmatic_segmenter/languages/english_spec.rb +1348 -0
  32. data/spec/pragmatic_segmenter/languages/french_spec.rb +31 -0
  33. data/spec/pragmatic_segmenter/languages/greek_spec.rb +18 -0
  34. data/spec/pragmatic_segmenter/languages/hindi_spec.rb +18 -0
  35. data/spec/pragmatic_segmenter/languages/italian_spec.rb +190 -0
  36. data/spec/pragmatic_segmenter/languages/japanese_spec.rb +53 -0
  37. data/spec/pragmatic_segmenter/languages/persian_spec.rb +18 -0
  38. data/spec/pragmatic_segmenter/languages/polish_spec.rb +11 -0
  39. data/spec/pragmatic_segmenter/languages/russian_spec.rb +219 -0
  40. data/spec/pragmatic_segmenter/languages/spanish_spec.rb +189 -0
  41. data/spec/pragmatic_segmenter/languages/urdu_spec.rb +18 -0
  42. data/spec/pragmatic_segmenter/languages_spec.rb +31 -0
  43. data/spec/pragmatic_segmenter_spec.rb +24 -2583
  44. metadata +59 -8
  45. data/lib/pragmatic_segmenter/number.rb +0 -35
  46. data/lib/pragmatic_segmenter/rules.rb +0 -168
  47. data/lib/pragmatic_segmenter/rules/ellipsis.rb +0 -35
  48. data/lib/pragmatic_segmenter/rules/html.rb +0 -13
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.3.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-27 00:00:00.000000000 Z
11
+ date: 2015-12-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: guard-rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  description: 'Pragmatic Segmenter is a sentence segmentation tool for Ruby. It allows
56
70
  you to split a text into an array of sentences. This gem provides 2 main benefits
57
71
  over other segmentation gems - 1) It works well even with ill-formatted text 2)
@@ -73,6 +87,7 @@ files:
73
87
  - lib/pragmatic_segmenter/abbreviation_replacer.rb
74
88
  - lib/pragmatic_segmenter/between_punctuation.rb
75
89
  - lib/pragmatic_segmenter/cleaner.rb
90
+ - lib/pragmatic_segmenter/cleaner/rules.rb
76
91
  - lib/pragmatic_segmenter/exclamation_words.rb
77
92
  - lib/pragmatic_segmenter/languages.rb
78
93
  - lib/pragmatic_segmenter/languages/amharic.rb
@@ -81,6 +96,8 @@ files:
81
96
  - lib/pragmatic_segmenter/languages/burmese.rb
82
97
  - lib/pragmatic_segmenter/languages/chinese.rb
83
98
  - lib/pragmatic_segmenter/languages/common.rb
99
+ - lib/pragmatic_segmenter/languages/common/ellipsis.rb
100
+ - lib/pragmatic_segmenter/languages/common/numbers.rb
84
101
  - lib/pragmatic_segmenter/languages/deutsch.rb
85
102
  - lib/pragmatic_segmenter/languages/dutch.rb
86
103
  - lib/pragmatic_segmenter/languages/english.rb
@@ -95,17 +112,32 @@ files:
95
112
  - lib/pragmatic_segmenter/languages/spanish.rb
96
113
  - lib/pragmatic_segmenter/languages/urdu.rb
97
114
  - lib/pragmatic_segmenter/list.rb
98
- - lib/pragmatic_segmenter/number.rb
99
- - lib/pragmatic_segmenter/process.rb
115
+ - lib/pragmatic_segmenter/processor.rb
100
116
  - lib/pragmatic_segmenter/punctuation_replacer.rb
101
- - lib/pragmatic_segmenter/rules.rb
102
- - lib/pragmatic_segmenter/rules/ellipsis.rb
103
- - lib/pragmatic_segmenter/rules/html.rb
104
117
  - lib/pragmatic_segmenter/segmenter.rb
105
118
  - lib/pragmatic_segmenter/types.rb
106
119
  - lib/pragmatic_segmenter/version.rb
107
120
  - pragmatic_segmenter.gemspec
108
121
  - spec/performance_spec.rb
122
+ - spec/pragmatic_segmenter/languages/amharic_spec.rb
123
+ - spec/pragmatic_segmenter/languages/arabic_spec.rb
124
+ - spec/pragmatic_segmenter/languages/armenian_spec.rb
125
+ - spec/pragmatic_segmenter/languages/burmese_spec.rb
126
+ - spec/pragmatic_segmenter/languages/chinese_spec.rb
127
+ - spec/pragmatic_segmenter/languages/deutsch_spec.rb
128
+ - spec/pragmatic_segmenter/languages/dutch_spec.rb
129
+ - spec/pragmatic_segmenter/languages/english_spec.rb
130
+ - spec/pragmatic_segmenter/languages/french_spec.rb
131
+ - spec/pragmatic_segmenter/languages/greek_spec.rb
132
+ - spec/pragmatic_segmenter/languages/hindi_spec.rb
133
+ - spec/pragmatic_segmenter/languages/italian_spec.rb
134
+ - spec/pragmatic_segmenter/languages/japanese_spec.rb
135
+ - spec/pragmatic_segmenter/languages/persian_spec.rb
136
+ - spec/pragmatic_segmenter/languages/polish_spec.rb
137
+ - spec/pragmatic_segmenter/languages/russian_spec.rb
138
+ - spec/pragmatic_segmenter/languages/spanish_spec.rb
139
+ - spec/pragmatic_segmenter/languages/urdu_spec.rb
140
+ - spec/pragmatic_segmenter/languages_spec.rb
109
141
  - spec/pragmatic_segmenter_spec.rb
110
142
  - spec/spec_helper.rb
111
143
  homepage: https://github.com/diasks2/pragmatic_segmenter
@@ -128,12 +160,31 @@ required_rubygems_version: !ruby/object:Gem::Requirement
128
160
  version: '0'
129
161
  requirements: []
130
162
  rubyforge_project:
131
- rubygems_version: 2.4.1
163
+ rubygems_version: 2.4.8
132
164
  signing_key:
133
165
  specification_version: 4
134
166
  summary: A rule-based sentence boundary detection gem that works out-of-the-box across
135
167
  many languages
136
168
  test_files:
137
169
  - spec/performance_spec.rb
170
+ - spec/pragmatic_segmenter/languages/amharic_spec.rb
171
+ - spec/pragmatic_segmenter/languages/arabic_spec.rb
172
+ - spec/pragmatic_segmenter/languages/armenian_spec.rb
173
+ - spec/pragmatic_segmenter/languages/burmese_spec.rb
174
+ - spec/pragmatic_segmenter/languages/chinese_spec.rb
175
+ - spec/pragmatic_segmenter/languages/deutsch_spec.rb
176
+ - spec/pragmatic_segmenter/languages/dutch_spec.rb
177
+ - spec/pragmatic_segmenter/languages/english_spec.rb
178
+ - spec/pragmatic_segmenter/languages/french_spec.rb
179
+ - spec/pragmatic_segmenter/languages/greek_spec.rb
180
+ - spec/pragmatic_segmenter/languages/hindi_spec.rb
181
+ - spec/pragmatic_segmenter/languages/italian_spec.rb
182
+ - spec/pragmatic_segmenter/languages/japanese_spec.rb
183
+ - spec/pragmatic_segmenter/languages/persian_spec.rb
184
+ - spec/pragmatic_segmenter/languages/polish_spec.rb
185
+ - spec/pragmatic_segmenter/languages/russian_spec.rb
186
+ - spec/pragmatic_segmenter/languages/spanish_spec.rb
187
+ - spec/pragmatic_segmenter/languages/urdu_spec.rb
188
+ - spec/pragmatic_segmenter/languages_spec.rb
138
189
  - spec/pragmatic_segmenter_spec.rb
139
190
  - spec/spec_helper.rb
@@ -1,35 +0,0 @@
1
- # -*- encoding : utf-8 -*-
2
-
3
- module PragmaticSegmenter
4
- # This class searches for numbers with periods within a string and
5
- # replaces the periods.
6
- class Number
7
- # Rubular: http://rubular.com/r/oNyxBOqbyy
8
- PeriodBeforeNumberRule = Rule.new(/\.(?=\d)/, '∯')
9
-
10
- # Rubular: http://rubular.com/r/EMk5MpiUzt
11
- NumberAfterPeriodBeforeLetterRule = Rule.new(/(?<=\d)\.(?=\S)/, '∯')
12
-
13
- # Rubular: http://rubular.com/r/rf4l1HjtjG
14
- NewLineNumberPeriodSpaceLetterRule = Rule.new(/(?<=\r\d)\.(?=(\s\S)|\))/, '∯')
15
-
16
- # Rubular: http://rubular.com/r/HPa4sdc6b9
17
- StartLineNumberPeriodRule = Rule.new(/(?<=^\d)\.(?=(\s\S)|\))/, '∯')
18
-
19
- # Rubular: http://rubular.com/r/NuvWnKleFl
20
- StartLineTwoDigitNumberPeriodRule = Rule.new(/(?<=^\d\d)\.(?=(\s\S)|\))/, '∯')
21
-
22
- attr_reader :text
23
- def initialize(text:)
24
- @text = Text.new(text)
25
- end
26
-
27
- def replace
28
- @text.apply PeriodBeforeNumberRule,
29
- NumberAfterPeriodBeforeLetterRule,
30
- NewLineNumberPeriodSpaceLetterRule,
31
- StartLineNumberPeriodRule,
32
- StartLineTwoDigitNumberPeriodRule
33
- end
34
- end
35
- end
@@ -1,168 +0,0 @@
1
- require 'pragmatic_segmenter/rules/html'
2
-
3
- module PragmaticSegmenter
4
- module Rules
5
-
6
- URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']
7
-
8
- # Rubular: http://rubular.com/r/6dt98uI76u
9
- NO_SPACE_BETWEEN_SENTENCES_REGEX = /(?<=[a-z])\.(?=[A-Z])/
10
-
11
- # Rubular: http://rubular.com/r/l6KN6rH5XE
12
- NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = /(?<=\d)\.(?=[A-Z])/
13
-
14
- # Rubular: http://rubular.com/r/V57WnM9Zut
15
- NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')
16
-
17
- # Rubular: http://rubular.com/r/3GiRiP2IbD
18
- NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = /(?<=\s)\n(?=([a-z]|\())/
19
-
20
- # Rubular: http://rubular.com/r/UZAVcwqck8
21
- PDF_NewLineInMiddleOfSentenceRule = Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')
22
-
23
- # Rubular: http://rubular.com/r/eaNwGavmdo
24
- PDF_NewLineInMiddleOfSentenceNoSpacesRule = Rule.new(/\n(?=[a-z])/, ' ')
25
-
26
- # Rubular: http://rubular.com/r/bAJrhyLNeZ
27
- InlineFormattingRule = Rule.new(/\{b\^&gt;\d*&lt;b\^\}|\{b\^>\d*<b\^\}/, '')
28
-
29
- # Rubular: http://rubular.com/r/dMxp5MixFS
30
- DoubleNewLineWithSpaceRule = Rule.new(/\n \n/, "\r")
31
-
32
- # Rubular: http://rubular.com/r/H6HOJeA8bq
33
- DoubleNewLineRule = Rule.new(/\n\n/, "\r")
34
-
35
- # Rubular: http://rubular.com/r/Gn18aAnLdZ
36
- NewLineFollowedByBulletRule = Rule.new(/\n(?=•)/, "\r")
37
-
38
- # Rubular: http://rubular.com/r/FseyMiiYFT
39
- NewLineFollowedByPeriodRule = Rule.new(/\n(?=\.(\s|\n))/, '')
40
-
41
- # Rubular: http://rubular.com/r/8mc1ArOIGy
42
- TableOfContentsRule = Rule.new(/\.{5,}\s*\d+-*\d*/, "\r")
43
-
44
- # Rubular: http://rubular.com/r/DwNSuZrNtk
45
- ConsecutivePeriodsRule = Rule.new(/\.{5,}/, ' ')
46
-
47
- # Rubular: http://rubular.com/r/IQ4TPfsbd8
48
- ConsecutiveForwardSlashRule = Rule.new(/\/{3}/, '')
49
-
50
- # Rubular: http://rubular.com/r/6dt98uI76u
51
- NoSpaceBetweenSentencesRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
52
-
53
- # Rubular: http://rubular.com/r/l6KN6rH5XE
54
- NoSpaceBetweenSentencesDigitRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')
55
-
56
- EscapedCarriageReturnRule = Rule.new(/\\r/, "\r")
57
- TypoEscapedCarriageReturnRule = Rule.new(/\\\ r/, "\r")
58
-
59
- EscapedNewLineRule = Rule.new(/\\n/, "\n")
60
- TypoEscapedNewLineRule = Rule.new(/\\\ n/, "\n")
61
-
62
- ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")
63
-
64
- QuotationsFirstRule = Rule.new(/''/, '"')
65
- QuotationsSecondRule = Rule.new(/``/, '"')
66
-
67
- # Rubular: http://rubular.com/r/EUbZCNfgei
68
- AbbreviationsWithMultiplePeriodsAndEmailRule = Rule.new(/(\w)(\.)(\w)/, '\1∮\3')
69
-
70
- # Rubular: http://rubular.com/r/G2opjedIm9
71
- GeoLocationRule = Rule.new(/(?<=[a-zA-z]°)\.(?=\s*\d+)/, '∯')
72
-
73
- SingleNewLineRule = Rule.new(/\n/, 'ȹ')
74
-
75
- SubSingleQuoteRule = Rule.new(/&⎋&/, "'")
76
-
77
- ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ')
78
-
79
- # Rubular: http://rubular.com/r/aXPUGm6fQh
80
- QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '&ᓷ&')
81
-
82
- module ExclamationPointRules
83
- # Rubular: http://rubular.com/r/XS1XXFRfM2
84
- InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '&ᓴ&')
85
-
86
- # Rubular: http://rubular.com/r/sl57YI8LkA
87
- BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '&ᓴ&')
88
-
89
- # Rubular: http://rubular.com/r/f9zTjmkIPb
90
- MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '&ᓴ&')
91
-
92
- All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
93
- end
94
-
95
- module DoublePunctuationRules
96
- FirstRule = Rule.new(/\?!/, '☉')
97
- SecondRule = Rule.new(/!\?/, '☈')
98
- ThirdRule = Rule.new(/\?\?/, '☇')
99
- ForthRule = Rule.new(/!!/, '☄')
100
-
101
- All = [ FirstRule, SecondRule, ThirdRule, ForthRule ]
102
- end
103
-
104
- module ReinsertEllipsisRules
105
- SubThreeConsecutivePeriod = Rule.new(/ƪ/, '...')
106
- SubThreeSpacePeriod = Rule.new(/♟/, ' . . . ')
107
- SubFourSpacePeriod = Rule.new(/♝/, '. . . .')
108
- SubTwoConsecutivePeriod = Rule.new(/☏/, '..')
109
- SubOnePeriod = Rule.new(/∮/, '.')
110
-
111
- All = [ SubThreeConsecutivePeriod, SubThreeSpacePeriod,
112
- SubFourSpacePeriod, SubTwoConsecutivePeriod,
113
- SubOnePeriod ]
114
- end
115
-
116
- module SubSymbolsRules
117
- Period = Rule.new(/∯/, '.')
118
- ArabicComma = Rule.new(/♬/, '،')
119
- SemiColon = Rule.new(/♭/, ':')
120
- FullWidthPeriod = Rule.new(/&ᓰ&/, '。')
121
- SpecialPeriod = Rule.new(/&ᓱ&/, '.')
122
- FullWidthExclamation = Rule.new(/&ᓳ&/, '!')
123
- ExclamationPoint = Rule.new(/&ᓴ&/, '!')
124
- QuestionMark = Rule.new(/&ᓷ&/, '?')
125
- FullWidthQuestionMark = Rule.new(/&ᓸ&/, '?')
126
- MixedDoubleQE = Rule.new(/☉/, '?!')
127
- MixedDoubleQQ = Rule.new(/☇/, '??')
128
- MixedDoubleEQ = Rule.new(/☈/, '!?')
129
- MixedDoubleEE = Rule.new(/☄/, '!!')
130
- LeftParens = Rule.new(/&✂&/, '(')
131
- RightParens = Rule.new(/&⌬&/, ')')
132
- TemporaryEndingPunctutation = Rule.new('ȸ', '')
133
- Newline = Rule.new(/ȹ/, "\n")
134
-
135
- All = [ Period, ArabicComma,
136
- SemiColon, FullWidthPeriod,
137
- SpecialPeriod, FullWidthExclamation,
138
- ExclamationPoint, QuestionMark,
139
- FullWidthQuestionMark, MixedDoubleQE,
140
- MixedDoubleQQ, MixedDoubleEQ,
141
- MixedDoubleEE, LeftParens,
142
- RightParens, TemporaryEndingPunctutation,
143
- Newline ]
144
- end
145
-
146
- module EscapeRegexReservedCharacters
147
- LeftParen = Rule.new('(', '\\(')
148
- RightParen = Rule.new(')', '\\)')
149
- LeftBracket = Rule.new('[', '\\[')
150
- RightBracket = Rule.new(']', '\\]')
151
- Dash = Rule.new('-', '\\-')
152
-
153
- All = [ LeftParen, RightParen,
154
- LeftBracket, RightBracket, Dash ]
155
- end
156
-
157
- module SubEscapedRegexReservedCharacters
158
- SubLeftParen = Rule.new('\\(', '(')
159
- SubRightParen = Rule.new('\\)', ')')
160
- SubLeftBracket = Rule.new('\\[', '[')
161
- SubRightBracket = Rule.new('\\]', ']')
162
- SubDash = Rule.new('\\-', '-')
163
-
164
- All = [ SubLeftParen, SubRightParen,
165
- SubLeftBracket, SubRightBracket, SubDash ]
166
- end
167
- end
168
- end
@@ -1,35 +0,0 @@
1
- # -*- encoding : utf-8 -*-
2
-
3
- module PragmaticSegmenter
4
- module Rules
5
- # This class searches for ellipses within a string and
6
- # replaces the periods.
7
-
8
- # http://www.dailywritingtips.com/in-search-of-a-4-dot-ellipsis/
9
- # http://www.thepunctuationguide.com/ellipses.html
10
-
11
- module EllipsisRules
12
- # Rubular: http://rubular.com/r/i60hCK81fz
13
- ThreeConsecutiveRule = Rule.new(/\.\.\.(?=\s+[A-Z])/, '☏.')
14
-
15
- # Rubular: http://rubular.com/r/Hdqpd90owl
16
- FourConsecutiveRule = Rule.new(/(?<=\S)\.{3}(?=\.\s[A-Z])/, 'ƪ')
17
-
18
- # Rubular: http://rubular.com/r/YBG1dIHTRu
19
- ThreeSpaceRule = Rule.new(/(\s\.){3}\s/, '♟')
20
-
21
- # Rubular: http://rubular.com/r/2VvZ8wRbd8
22
- FourSpaceRule = Rule.new(/(?<=[a-z])(\.\s){3}\.(\z|$|\n)/, '♝')
23
-
24
- OtherThreePeriodRule = Rule.new(/\.\.\./, 'ƪ')
25
-
26
- All = [
27
- ThreeSpaceRule,
28
- FourSpaceRule,
29
- FourConsecutiveRule,
30
- ThreeConsecutiveRule,
31
- OtherThreePeriodRule
32
- ]
33
- end
34
- end
35
- end
@@ -1,13 +0,0 @@
1
- module PragmaticSegmenter
2
- module Rules
3
- module HTMLRules
4
- # Rubular: http://rubular.com/r/ENrVFMdJ8v
5
- HTMLTagRule = Rule.new(/<\/?[^>]*>/, '')
6
-
7
- # Rubular: http://rubular.com/r/XZVqMPJhea
8
- EscapedHTMLTagRule = Rule.new(/&lt;\/?[^gt;]*gt;/, '')
9
-
10
- All = [HTMLTagRule, EscapedHTMLTagRule]
11
- end
12
- end
13
- end