pragmatic_segmenter 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -2
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +16 -51
- data/lib/pragmatic_segmenter/cleaner.rb +18 -99
- data/lib/pragmatic_segmenter/languages.rb +62 -0
- data/lib/pragmatic_segmenter/languages/amharic.rb +4 -30
- data/lib/pragmatic_segmenter/languages/arabic.rb +21 -64
- data/lib/pragmatic_segmenter/languages/armenian.rb +4 -30
- data/lib/pragmatic_segmenter/languages/burmese.rb +4 -30
- data/lib/pragmatic_segmenter/languages/chinese.rb +8 -0
- data/lib/pragmatic_segmenter/languages/common.rb +70 -1
- data/lib/pragmatic_segmenter/languages/deutsch.rb +49 -78
- data/lib/pragmatic_segmenter/languages/dutch.rb +5 -36
- data/lib/pragmatic_segmenter/languages/english.rb +3 -12
- data/lib/pragmatic_segmenter/languages/french.rb +5 -32
- data/lib/pragmatic_segmenter/languages/greek.rb +4 -26
- data/lib/pragmatic_segmenter/languages/hindi.rb +4 -30
- data/lib/pragmatic_segmenter/languages/italian.rb +3 -37
- data/lib/pragmatic_segmenter/languages/japanese.rb +6 -4
- data/lib/pragmatic_segmenter/languages/persian.rb +16 -40
- data/lib/pragmatic_segmenter/languages/polish.rb +6 -38
- data/lib/pragmatic_segmenter/languages/russian.rb +13 -33
- data/lib/pragmatic_segmenter/languages/spanish.rb +6 -31
- data/lib/pragmatic_segmenter/languages/urdu.rb +4 -30
- data/lib/pragmatic_segmenter/number.rb +5 -5
- data/lib/pragmatic_segmenter/process.rb +28 -49
- data/lib/pragmatic_segmenter/rules.rb +65 -1
- data/lib/pragmatic_segmenter/{ellipsis.rb → rules/ellipsis.rb} +0 -0
- data/lib/pragmatic_segmenter/rules/html.rb +13 -0
- data/lib/pragmatic_segmenter/segmenter.rb +12 -32
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +6 -7
- metadata +6 -8
- data/lib/pragmatic_segmenter/abbreviation.rb +0 -22
- data/lib/pragmatic_segmenter/language_support.rb +0 -31
- data/lib/pragmatic_segmenter/punctuation.rb +0 -12
- data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +0 -17
- data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +0 -37
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-05-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -70,17 +70,16 @@ files:
|
|
70
70
|
- README.md
|
71
71
|
- Rakefile
|
72
72
|
- lib/pragmatic_segmenter.rb
|
73
|
-
- lib/pragmatic_segmenter/abbreviation.rb
|
74
73
|
- lib/pragmatic_segmenter/abbreviation_replacer.rb
|
75
74
|
- lib/pragmatic_segmenter/between_punctuation.rb
|
76
75
|
- lib/pragmatic_segmenter/cleaner.rb
|
77
|
-
- lib/pragmatic_segmenter/ellipsis.rb
|
78
76
|
- lib/pragmatic_segmenter/exclamation_words.rb
|
79
|
-
- lib/pragmatic_segmenter/language_support.rb
|
77
|
+
- lib/pragmatic_segmenter/languages.rb
|
80
78
|
- lib/pragmatic_segmenter/languages/amharic.rb
|
81
79
|
- lib/pragmatic_segmenter/languages/arabic.rb
|
82
80
|
- lib/pragmatic_segmenter/languages/armenian.rb
|
83
81
|
- lib/pragmatic_segmenter/languages/burmese.rb
|
82
|
+
- lib/pragmatic_segmenter/languages/chinese.rb
|
84
83
|
- lib/pragmatic_segmenter/languages/common.rb
|
85
84
|
- lib/pragmatic_segmenter/languages/deutsch.rb
|
86
85
|
- lib/pragmatic_segmenter/languages/dutch.rb
|
@@ -98,12 +97,11 @@ files:
|
|
98
97
|
- lib/pragmatic_segmenter/list.rb
|
99
98
|
- lib/pragmatic_segmenter/number.rb
|
100
99
|
- lib/pragmatic_segmenter/process.rb
|
101
|
-
- lib/pragmatic_segmenter/punctuation.rb
|
102
100
|
- lib/pragmatic_segmenter/punctuation_replacer.rb
|
103
101
|
- lib/pragmatic_segmenter/rules.rb
|
102
|
+
- lib/pragmatic_segmenter/rules/ellipsis.rb
|
103
|
+
- lib/pragmatic_segmenter/rules/html.rb
|
104
104
|
- lib/pragmatic_segmenter/segmenter.rb
|
105
|
-
- lib/pragmatic_segmenter/sentence_boundary_punctuation.rb
|
106
|
-
- lib/pragmatic_segmenter/single_letter_abbreviation.rb
|
107
105
|
- lib/pragmatic_segmenter/types.rb
|
108
106
|
- lib/pragmatic_segmenter/version.rb
|
109
107
|
- pragmatic_segmenter.gemspec
|
@@ -1,22 +0,0 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
|
-
|
3
|
-
module PragmaticSegmenter
|
4
|
-
# Defines the abbreviations for each language (if available)
|
5
|
-
class Abbreviation
|
6
|
-
ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
|
7
|
-
PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']
|
8
|
-
NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
|
9
|
-
|
10
|
-
def all
|
11
|
-
ABBREVIATIONS
|
12
|
-
end
|
13
|
-
|
14
|
-
def prepositive
|
15
|
-
PREPOSITIVE_ABBREVIATIONS
|
16
|
-
end
|
17
|
-
|
18
|
-
def number
|
19
|
-
NUMBER_ABBREVIATIONS
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
@@ -1,31 +0,0 @@
|
|
1
|
-
module PragmaticSegmenter
|
2
|
-
module LanguageSupport
|
3
|
-
LANGUAGE_CODES = {
|
4
|
-
'en' => 'English',
|
5
|
-
'de' => 'Deutsch',
|
6
|
-
'es' => 'Spanish',
|
7
|
-
'fr' => 'French',
|
8
|
-
'it' => 'Italian',
|
9
|
-
'ja' => 'Japanese',
|
10
|
-
'el' => 'Greek',
|
11
|
-
'ru' => 'Russian',
|
12
|
-
'ar' => 'Arabic',
|
13
|
-
'am' => 'Amharic',
|
14
|
-
'hi' => 'Hindi',
|
15
|
-
'hy' => 'Armenian',
|
16
|
-
'fa' => 'Persian',
|
17
|
-
'my' => 'Burmese',
|
18
|
-
'ur' => 'Urdu',
|
19
|
-
'nl' => 'Dutch',
|
20
|
-
'pl' => 'Polish',
|
21
|
-
}
|
22
|
-
|
23
|
-
def process_class
|
24
|
-
Object.const_get("PragmaticSegmenter::Languages::#{LANGUAGE_CODES[language] || 'Common'}::Process")
|
25
|
-
end
|
26
|
-
|
27
|
-
def cleaner_class
|
28
|
-
Object.const_get("PragmaticSegmenter::Languages::#{LANGUAGE_CODES[language] || 'Common'}::Cleaner")
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|
@@ -1,17 +0,0 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
|
-
|
3
|
-
module PragmaticSegmenter
|
4
|
-
# This class splits text at sentence boundary punctuation marks
|
5
|
-
class SentenceBoundaryPunctuation
|
6
|
-
SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。．.！!?？ȸȹ☉☈☇☄]/
|
7
|
-
|
8
|
-
attr_reader :text
|
9
|
-
def initialize(text:)
|
10
|
-
@text = text
|
11
|
-
end
|
12
|
-
|
13
|
-
def split
|
14
|
-
text.scan(SENTENCE_BOUNDARY_REGEX)
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|
@@ -1,37 +0,0 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
|
-
|
3
|
-
module PragmaticSegmenter
|
4
|
-
# This class searches for periods within an abbreviation and
|
5
|
-
# replaces the periods.
|
6
|
-
class SingleLetterAbbreviation
|
7
|
-
# Rubular: http://rubular.com/r/e3H6kwnr6H
|
8
|
-
SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
|
9
|
-
|
10
|
-
# Rubular: http://rubular.com/r/gitvf0YWH4
|
11
|
-
SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=\s)/, '∯')
|
12
|
-
|
13
|
-
attr_reader :text
|
14
|
-
def initialize(text:)
|
15
|
-
@text = text
|
16
|
-
end
|
17
|
-
|
18
|
-
def replace
|
19
|
-
@formatted_text = replace_single_letter_abbreviations(text)
|
20
|
-
end
|
21
|
-
|
22
|
-
private
|
23
|
-
|
24
|
-
def replace_single_letter_abbreviations(txt)
|
25
|
-
new_text = replace_single_uppercase_letter_abbreviation_at_start_of_line(txt)
|
26
|
-
replace_single_uppercase_letter_abbreviation(new_text)
|
27
|
-
end
|
28
|
-
|
29
|
-
def replace_single_uppercase_letter_abbreviation_at_start_of_line(txt)
|
30
|
-
txt.apply(SingleUpperCaseLetterAtStartOfLineRule)
|
31
|
-
end
|
32
|
-
|
33
|
-
def replace_single_uppercase_letter_abbreviation(txt)
|
34
|
-
txt.apply(SingleUpperCaseLetterRule)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|