pragmatic_segmenter 0.3.1 → 0.3.2

This diff shows the content changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/README.md +8 -2
  3. data/lib/pragmatic_segmenter/abbreviation_replacer.rb +16 -51
  4. data/lib/pragmatic_segmenter/cleaner.rb +18 -99
  5. data/lib/pragmatic_segmenter/languages.rb +62 -0
  6. data/lib/pragmatic_segmenter/languages/amharic.rb +4 -30
  7. data/lib/pragmatic_segmenter/languages/arabic.rb +21 -64
  8. data/lib/pragmatic_segmenter/languages/armenian.rb +4 -30
  9. data/lib/pragmatic_segmenter/languages/burmese.rb +4 -30
  10. data/lib/pragmatic_segmenter/languages/chinese.rb +8 -0
  11. data/lib/pragmatic_segmenter/languages/common.rb +70 -1
  12. data/lib/pragmatic_segmenter/languages/deutsch.rb +49 -78
  13. data/lib/pragmatic_segmenter/languages/dutch.rb +5 -36
  14. data/lib/pragmatic_segmenter/languages/english.rb +3 -12
  15. data/lib/pragmatic_segmenter/languages/french.rb +5 -32
  16. data/lib/pragmatic_segmenter/languages/greek.rb +4 -26
  17. data/lib/pragmatic_segmenter/languages/hindi.rb +4 -30
  18. data/lib/pragmatic_segmenter/languages/italian.rb +3 -37
  19. data/lib/pragmatic_segmenter/languages/japanese.rb +6 -4
  20. data/lib/pragmatic_segmenter/languages/persian.rb +16 -40
  21. data/lib/pragmatic_segmenter/languages/polish.rb +6 -38
  22. data/lib/pragmatic_segmenter/languages/russian.rb +13 -33
  23. data/lib/pragmatic_segmenter/languages/spanish.rb +6 -31
  24. data/lib/pragmatic_segmenter/languages/urdu.rb +4 -30
  25. data/lib/pragmatic_segmenter/number.rb +5 -5
  26. data/lib/pragmatic_segmenter/process.rb +28 -49
  27. data/lib/pragmatic_segmenter/rules.rb +65 -1
  28. data/lib/pragmatic_segmenter/{ellipsis.rb → rules/ellipsis.rb} +0 -0
  29. data/lib/pragmatic_segmenter/rules/html.rb +13 -0
  30. data/lib/pragmatic_segmenter/segmenter.rb +12 -32
  31. data/lib/pragmatic_segmenter/version.rb +1 -1
  32. data/spec/pragmatic_segmenter_spec.rb +6 -7
  33. metadata +6 -8
  34. data/lib/pragmatic_segmenter/abbreviation.rb +0 -22
  35. data/lib/pragmatic_segmenter/language_support.rb +0 -31
  36. data/lib/pragmatic_segmenter/punctuation.rb +0 -12
  37. data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +0 -17
  38. data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +0 -37
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-02 00:00:00.000000000 Z
11
+ date: 2015-05-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -70,17 +70,16 @@ files:
70
70
  - README.md
71
71
  - Rakefile
72
72
  - lib/pragmatic_segmenter.rb
73
- - lib/pragmatic_segmenter/abbreviation.rb
74
73
  - lib/pragmatic_segmenter/abbreviation_replacer.rb
75
74
  - lib/pragmatic_segmenter/between_punctuation.rb
76
75
  - lib/pragmatic_segmenter/cleaner.rb
77
- - lib/pragmatic_segmenter/ellipsis.rb
78
76
  - lib/pragmatic_segmenter/exclamation_words.rb
79
- - lib/pragmatic_segmenter/language_support.rb
77
+ - lib/pragmatic_segmenter/languages.rb
80
78
  - lib/pragmatic_segmenter/languages/amharic.rb
81
79
  - lib/pragmatic_segmenter/languages/arabic.rb
82
80
  - lib/pragmatic_segmenter/languages/armenian.rb
83
81
  - lib/pragmatic_segmenter/languages/burmese.rb
82
+ - lib/pragmatic_segmenter/languages/chinese.rb
84
83
  - lib/pragmatic_segmenter/languages/common.rb
85
84
  - lib/pragmatic_segmenter/languages/deutsch.rb
86
85
  - lib/pragmatic_segmenter/languages/dutch.rb
@@ -98,12 +97,11 @@ files:
98
97
  - lib/pragmatic_segmenter/list.rb
99
98
  - lib/pragmatic_segmenter/number.rb
100
99
  - lib/pragmatic_segmenter/process.rb
101
- - lib/pragmatic_segmenter/punctuation.rb
102
100
  - lib/pragmatic_segmenter/punctuation_replacer.rb
103
101
  - lib/pragmatic_segmenter/rules.rb
102
+ - lib/pragmatic_segmenter/rules/ellipsis.rb
103
+ - lib/pragmatic_segmenter/rules/html.rb
104
104
  - lib/pragmatic_segmenter/segmenter.rb
105
- - lib/pragmatic_segmenter/sentence_boundary_punctuation.rb
106
- - lib/pragmatic_segmenter/single_letter_abbreviation.rb
107
105
  - lib/pragmatic_segmenter/types.rb
108
106
  - lib/pragmatic_segmenter/version.rb
109
107
  - pragmatic_segmenter.gemspec
@@ -1,22 +0,0 @@
1
- # -*- encoding : utf-8 -*-
2
-
3
- module PragmaticSegmenter
4
- # Defines the abbreviations for each language (if available)
5
- class Abbreviation
6
- ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
7
- PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']
8
- NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
9
-
10
- def all
11
- ABBREVIATIONS
12
- end
13
-
14
- def prepositive
15
- PREPOSITIVE_ABBREVIATIONS
16
- end
17
-
18
- def number
19
- NUMBER_ABBREVIATIONS
20
- end
21
- end
22
- end
@@ -1,31 +0,0 @@
1
- module PragmaticSegmenter
2
- module LanguageSupport
3
- LANGUAGE_CODES = {
4
- 'en' => 'English',
5
- 'de' => 'Deutsch',
6
- 'es' => 'Spanish',
7
- 'fr' => 'French',
8
- 'it' => 'Italian',
9
- 'ja' => 'Japanese',
10
- 'el' => 'Greek',
11
- 'ru' => 'Russian',
12
- 'ar' => 'Arabic',
13
- 'am' => 'Amharic',
14
- 'hi' => 'Hindi',
15
- 'hy' => 'Armenian',
16
- 'fa' => 'Persian',
17
- 'my' => 'Burmese',
18
- 'ur' => 'Urdu',
19
- 'nl' => 'Dutch',
20
- 'pl' => 'Polish',
21
- }
22
-
23
- def process_class
24
- Object.const_get("PragmaticSegmenter::Languages::#{LANGUAGE_CODES[language] || 'Common'}::Process")
25
- end
26
-
27
- def cleaner_class
28
- Object.const_get("PragmaticSegmenter::Languages::#{LANGUAGE_CODES[language] || 'Common'}::Cleaner")
29
- end
30
- end
31
- end
@@ -1,12 +0,0 @@
1
- # -*- encoding : utf-8 -*-
2
-
3
- module PragmaticSegmenter
4
- # This class holds the punctuation marks.
5
- class Punctuation
6
- PUNCT = ['。', '．', '.', '！', '!', '?', '？']
7
-
8
- def punct
9
- PUNCT
10
- end
11
- end
12
- end
@@ -1,17 +0,0 @@
1
- # -*- encoding : utf-8 -*-
2
-
3
- module PragmaticSegmenter
4
- # This class splits text at sentence boundary punctuation marks
5
- class SentenceBoundaryPunctuation
6
- SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。．.！!?？ȸȹ☉☈☇☄]/
7
-
8
- attr_reader :text
9
- def initialize(text:)
10
- @text = text
11
- end
12
-
13
- def split
14
- text.scan(SENTENCE_BOUNDARY_REGEX)
15
- end
16
- end
17
- end
@@ -1,37 +0,0 @@
1
- # -*- encoding : utf-8 -*-
2
-
3
- module PragmaticSegmenter
4
- # This class searches for periods within an abbreviation and
5
- # replaces the periods.
6
- class SingleLetterAbbreviation
7
- # Rubular: http://rubular.com/r/e3H6kwnr6H
8
- SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
9
-
10
- # Rubular: http://rubular.com/r/gitvf0YWH4
11
- SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=\s)/, '∯')
12
-
13
- attr_reader :text
14
- def initialize(text:)
15
- @text = text
16
- end
17
-
18
- def replace
19
- @formatted_text = replace_single_letter_abbreviations(text)
20
- end
21
-
22
- private
23
-
24
- def replace_single_letter_abbreviations(txt)
25
- new_text = replace_single_uppercase_letter_abbreviation_at_start_of_line(txt)
26
- replace_single_uppercase_letter_abbreviation(new_text)
27
- end
28
-
29
- def replace_single_uppercase_letter_abbreviation_at_start_of_line(txt)
30
- txt.apply(SingleUpperCaseLetterAtStartOfLineRule)
31
- end
32
-
33
- def replace_single_uppercase_letter_abbreviation(txt)
34
- txt.apply(SingleUpperCaseLetterRule)
35
- end
36
- end
37
- end