pristine_text 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 03ce1c942c7e32460cf0c312a5d897a205f4a2a7
4
+ data.tar.gz: 67916011e5695da1ec0c521e5f693215f3c82238
5
+ SHA512:
6
+ metadata.gz: c43004bf30849ab031a6c314a52bad0b863c1591304c082e5761dccc55f0bf658e9ea855a6a46e03c6e35412355c62347c494c2b4f8f25ec30a21a2dce10d830
7
+ data.tar.gz: 43a0bbcb713ab367ccd531448a82a101de22af9f85fe6fdd5acf2556918533cb647ad575f2af790139cdfe89a62f1408486f124d675e236e0129ae886522d015
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in pristine_text.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Nurettin Onur TUĞCU
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,33 @@
1
+ # PristineText
2
+
3
+ This gem uses unicode_utils to lowercase text, removes non-letters, strips and squeezes whitespace, then optionally uses stemwords (from libstemmer-tools) to stem every word.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'pristine_text'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install pristine_text
20
+
21
+ ## Usage
22
+
23
+ require "pristine_text"
24
+
25
+ puts PristineText.clean("haberler geliyorlar gidiyorlar", :tr)
26
+
27
+ ## Contributing
28
+
29
+ 1. Fork it ( https://github.com/nurettin/pristine_text/fork )
30
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
31
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
32
+ 4. Push to the branch (`git push origin my-new-feature`)
33
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require 'rake/testtask'
2
+
3
# Registers a Rake test task (under TestTask's default name).
# Every directory below lib/ is put on the load path, and every regular
# file below test/ is treated as a test file.
Rake::TestTask.new do |task|
  lib_dirs     = FileList['lib/**/*'].select { |path| File.directory?(path) }
  test_sources = FileList['test/**/*'].reject { |path| File.directory?(path) }

  task.libs = lib_dirs
  task.test_files = test_sources
end
7
+
@@ -0,0 +1,3 @@
1
# Namespace of the pristine_text gem.
module PristineText
  # Release number of this gem.
  VERSION = "0.0.1"
end
@@ -0,0 +1,35 @@
1
+ require "pristine_text/version"
2
+ require "open3"
3
+ require "unicode_utils"
4
+
5
# Text normalisation helpers: lowercasing, stripping of non-letters,
# whitespace squeezing and (optionally) stemming through the external
# +stemwords+ command line tool.
module PristineText
  # The characters matched by +\s+ in the cleaning regexp. Only runs of
  # these characters are squeezed, so doubled letters ("hello") survive.
  WHITESPACE = " \t\r\n\f\v".freeze

  # Feeds +text+ to the external +stemwords+ program and returns its output.
  #
  # @param text [String] input, one word per line
  # @param locale [#to_s] language passed to +stemwords -l+
  # @return [String] the program's standard output with surrounding
  #   whitespace stripped
  # @raise [LoadError] when the +stemwords+ executable cannot be found
  def self.pipe(text, locale)
    # An argument list (instead of one command string) bypasses the shell,
    # so the locale value can never be interpreted as shell syntax.
    # capture3 feeds stdin and drains stdout/stderr concurrently, which
    # avoids the pipe-buffer deadlock of "write everything, then read".
    stemmed, _stderr, _status =
      Open3.capture3("stemwords", "-l", locale.to_s, stdin_data: text)
    stemmed.strip
  rescue Errno::ENOENT
    raise LoadError, "cannot find stemwords, install libstemmer-tools"
  end

  # Stems a single string or every element of an array.
  #
  # @param text [String, Array<String>] word(s) to stem
  # @param locale [#to_s] language passed on to {pipe}
  # @return [String, Array<String>, nil] a String for String input, an
  #   Array for Array input, and nil for any other kind of input
  def self.stem(text, locale)
    case text
    when Array
      # One word per line in, one stem per line out.
      pipe(text.join("\n"), locale).split("\n")
    when String
      pipe(text, locale)
    end
  end

  # Lowercases +text+, removes everything that is neither a letter nor
  # whitespace, strips the ends, squeezes whitespace runs and optionally
  # stems every word.
  #
  # @param text [String] raw input text
  # @param locale [Symbol] language used for lowercasing and stemming
  # @param stem [Boolean] when true, every word is stemmed and the words
  #   are re-joined with single spaces
  # @return [String] the cleaned (and possibly stemmed) text
  def self.clean(text, locale= :en, stem= true)
    cleaned = UnicodeUtils.downcase(text, locale).
      gsub(/[^\p{Letter}\s]+/, "").
      strip.
      squeeze(WHITESPACE)
    return cleaned unless stem

    # Explicit receiver: the +stem+ parameter shadows the method name.
    self.stem(cleaned.split, locale).join(" ")
  end
end
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'pristine_text/version'
5
+
6
# Gem specification for pristine_text: identity, packaged files and
# dependencies.
Gem::Specification.new do |gem|
  # --- Identity -----------------------------------------------------------
  gem.name        = "pristine_text"
  gem.version     = PristineText::VERSION
  gem.license     = "MIT"
  gem.homepage    = "https://github.com/nurettin/pristine_text"
  gem.authors     = ["Nurettin Onur TUĞCU"]
  gem.email       = ["onurtugcu@gmail.com"]
  gem.summary     = %q{Lowercase, squeeze, stem text.}
  gem.description = %q{This gem uses unicode_utils to lowercase text, removes non-letters, strips and squeezes whitespace, then optionally stemwords (from stemming-tools) to stem every word.}

  # --- Packaged files -----------------------------------------------------
  # The file list is whatever git currently tracks (NUL-separated output).
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # --- Requirements -------------------------------------------------------
  gem.required_ruby_version = '~> 2.1'
  gem.add_dependency 'unicode_utils', '~> 0'
  gem.add_development_dependency "bundler", "~> 1.7"
  gem.add_development_dependency "rake", "~> 10.0"
end
metadata ADDED
@@ -0,0 +1,96 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pristine_text
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Nurettin Onur TUĞCU
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-10-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: unicode_utils
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: This gem uses unicode_utils to lowercase text, removes non-letters, strips
56
+ and squeezes whitespace, then optionally stemwords (from stemming-tools) to stem
57
+ every word.
58
+ email:
59
+ - onurtugcu@gmail.com
60
+ executables: []
61
+ extensions: []
62
+ extra_rdoc_files: []
63
+ files:
64
+ - ".gitignore"
65
+ - Gemfile
66
+ - LICENSE.txt
67
+ - README.md
68
+ - Rakefile
69
+ - lib/pristine_text.rb
70
+ - lib/pristine_text/version.rb
71
+ - pristine_text.gemspec
72
+ homepage: https://github.com/nurettin/pristine_text
73
+ licenses:
74
+ - MIT
75
+ metadata: {}
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - "~>"
83
+ - !ruby/object:Gem::Version
84
+ version: '2.1'
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ requirements: []
91
+ rubyforge_project:
92
+ rubygems_version: 2.2.2
93
+ signing_key:
94
+ specification_version: 4
95
+ summary: Lowercase, squeeze, stem text.
96
+ test_files: []