pragmatic_segmenter 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +1 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +730 -0
  7. data/Rakefile +4 -0
  8. data/lib/pragmatic_segmenter.rb +2 -0
  9. data/lib/pragmatic_segmenter/abbreviation.rb +22 -0
  10. data/lib/pragmatic_segmenter/abbreviation_replacer.rb +149 -0
  11. data/lib/pragmatic_segmenter/between_punctuation.rb +78 -0
  12. data/lib/pragmatic_segmenter/cleaner.rb +141 -0
  13. data/lib/pragmatic_segmenter/ellipsis.rb +36 -0
  14. data/lib/pragmatic_segmenter/exclamation_words.rb +19 -0
  15. data/lib/pragmatic_segmenter/languages/amharic.rb +33 -0
  16. data/lib/pragmatic_segmenter/languages/arabic.rb +83 -0
  17. data/lib/pragmatic_segmenter/languages/armenian.rb +33 -0
  18. data/lib/pragmatic_segmenter/languages/burmese.rb +33 -0
  19. data/lib/pragmatic_segmenter/languages/deutsch.rb +132 -0
  20. data/lib/pragmatic_segmenter/languages/english.rb +44 -0
  21. data/lib/pragmatic_segmenter/languages/french.rb +29 -0
  22. data/lib/pragmatic_segmenter/languages/greek.rb +29 -0
  23. data/lib/pragmatic_segmenter/languages/hindi.rb +33 -0
  24. data/lib/pragmatic_segmenter/languages/italian.rb +39 -0
  25. data/lib/pragmatic_segmenter/languages/japanese.rb +58 -0
  26. data/lib/pragmatic_segmenter/languages/persian.rb +56 -0
  27. data/lib/pragmatic_segmenter/languages/russian.rb +60 -0
  28. data/lib/pragmatic_segmenter/languages/spanish.rb +39 -0
  29. data/lib/pragmatic_segmenter/languages/urdu.rb +33 -0
  30. data/lib/pragmatic_segmenter/list.rb +169 -0
  31. data/lib/pragmatic_segmenter/number.rb +35 -0
  32. data/lib/pragmatic_segmenter/process.rb +126 -0
  33. data/lib/pragmatic_segmenter/punctuation.rb +12 -0
  34. data/lib/pragmatic_segmenter/punctuation_replacer.rb +62 -0
  35. data/lib/pragmatic_segmenter/rules.rb +38 -0
  36. data/lib/pragmatic_segmenter/segmenter.rb +81 -0
  37. data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +17 -0
  38. data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +37 -0
  39. data/lib/pragmatic_segmenter/types.rb +12 -0
  40. data/lib/pragmatic_segmenter/version.rb +3 -0
  41. data/pragmatic_segmenter.gemspec +25 -0
  42. data/spec/performance_spec.rb +24 -0
  43. data/spec/pragmatic_segmenter_spec.rb +1906 -0
  44. data/spec/spec_helper.rb +1 -0
  45. metadata +150 -0
@@ -0,0 +1 @@
1
+ require 'pragmatic_segmenter'
metadata ADDED
@@ -0,0 +1,150 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pragmatic_segmenter
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Kevin S. Dias
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-01-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rubocop
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: 'Pragmatic Segmenter is a sentence segmentation tool for Ruby. It allows
70
+ you to split a text into an array of sentences. This gem provides 2 main benefits
71
+ over other segmentation gems - 1) It works well even with ill-formatted text 2)
72
+ It works for multiple languages '
73
+ email:
74
+ - diasks2@gmail.com
75
+ executables: []
76
+ extensions: []
77
+ extra_rdoc_files: []
78
+ files:
79
+ - ".gitignore"
80
+ - ".rspec"
81
+ - Gemfile
82
+ - LICENSE.txt
83
+ - README.md
84
+ - Rakefile
85
+ - lib/pragmatic_segmenter.rb
86
+ - lib/pragmatic_segmenter/abbreviation.rb
87
+ - lib/pragmatic_segmenter/abbreviation_replacer.rb
88
+ - lib/pragmatic_segmenter/between_punctuation.rb
89
+ - lib/pragmatic_segmenter/cleaner.rb
90
+ - lib/pragmatic_segmenter/ellipsis.rb
91
+ - lib/pragmatic_segmenter/exclamation_words.rb
92
+ - lib/pragmatic_segmenter/languages/amharic.rb
93
+ - lib/pragmatic_segmenter/languages/arabic.rb
94
+ - lib/pragmatic_segmenter/languages/armenian.rb
95
+ - lib/pragmatic_segmenter/languages/burmese.rb
96
+ - lib/pragmatic_segmenter/languages/deutsch.rb
97
+ - lib/pragmatic_segmenter/languages/english.rb
98
+ - lib/pragmatic_segmenter/languages/french.rb
99
+ - lib/pragmatic_segmenter/languages/greek.rb
100
+ - lib/pragmatic_segmenter/languages/hindi.rb
101
+ - lib/pragmatic_segmenter/languages/italian.rb
102
+ - lib/pragmatic_segmenter/languages/japanese.rb
103
+ - lib/pragmatic_segmenter/languages/persian.rb
104
+ - lib/pragmatic_segmenter/languages/russian.rb
105
+ - lib/pragmatic_segmenter/languages/spanish.rb
106
+ - lib/pragmatic_segmenter/languages/urdu.rb
107
+ - lib/pragmatic_segmenter/list.rb
108
+ - lib/pragmatic_segmenter/number.rb
109
+ - lib/pragmatic_segmenter/process.rb
110
+ - lib/pragmatic_segmenter/punctuation.rb
111
+ - lib/pragmatic_segmenter/punctuation_replacer.rb
112
+ - lib/pragmatic_segmenter/rules.rb
113
+ - lib/pragmatic_segmenter/segmenter.rb
114
+ - lib/pragmatic_segmenter/sentence_boundary_punctuation.rb
115
+ - lib/pragmatic_segmenter/single_letter_abbreviation.rb
116
+ - lib/pragmatic_segmenter/types.rb
117
+ - lib/pragmatic_segmenter/version.rb
118
+ - pragmatic_segmenter.gemspec
119
+ - spec/performance_spec.rb
120
+ - spec/pragmatic_segmenter_spec.rb
121
+ - spec/spec_helper.rb
122
+ homepage: https://github.com/diasks2/pragmatic_segmenter
123
+ licenses:
124
+ - MIT
125
+ metadata: {}
126
+ post_install_message:
127
+ rdoc_options: []
128
+ require_paths:
129
+ - lib
130
+ required_ruby_version: !ruby/object:Gem::Requirement
131
+ requirements:
132
+ - - ">="
133
+ - !ruby/object:Gem::Version
134
+ version: '0'
135
+ required_rubygems_version: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - ">="
138
+ - !ruby/object:Gem::Version
139
+ version: '0'
140
+ requirements: []
141
+ rubyforge_project:
142
+ rubygems_version: 2.4.1
143
+ signing_key:
144
+ specification_version: 4
145
+ summary: A rule-based sentence boundary detection gem that works out-of-the-box across
146
+ many languages
147
+ test_files:
148
+ - spec/performance_spec.rb
149
+ - spec/pragmatic_segmenter_spec.rb
150
+ - spec/spec_helper.rb