pristine_text 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +3 -3
- data/lib/pristine_text/version.rb +1 -1
- data/lib/pristine_text.rb +2 -1
- data/pristine_text.gemspec +1 -1
- data/test/pristine_text.rb +9 -0
- metadata +8 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 26c345b29f87af15925158a92b87951a401ccd1e
|
4
|
+
data.tar.gz: f6bfa2f8ac0574daa87f37eb2f5634a3656686fa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 124f737e2ed1d18e423fae1d12033767d657cb7b9c5281472baf2736e096efc41077b6515087a6498bae0114e861b48216924b365156fdb55d571f821fa01ec4
|
7
|
+
data.tar.gz: 7d9320dd6e6443b376af26bcc74ef83812ef60f881281216e90358d2dfa301bc97fd90a026004d2028cd13934b96f2ab7a61d2434285bd9c555431fa5d8ac234
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# PristineText
|
2
2
|
|
3
|
-
This gem uses unicode_utils to lowercase text, removes non-letters, strips and squeezes whitespace, then optionally stemwords (from
|
3
|
+
This gem uses unicode_utils to lowercase text, removes non-letters, strips and squeezes whitespace, then optionally uses stemwords (from libstemming-tools) to stem every word.
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -20,9 +20,9 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
-
require "pristine_text"
|
23
|
+
require "pristine_text"
|
24
24
|
|
25
|
-
puts PristineText.clean("haberler geliyorlar gidiyorlar", :tr)
|
25
|
+
puts PristineText.clean("haberler geliyorlar gidiyorlar", :tr)
|
26
26
|
|
27
27
|
## Contributing
|
28
28
|
|
data/lib/pristine_text.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require "pristine_text/version"
|
2
2
|
require "open3"
|
3
3
|
require "unicode_utils"
|
4
|
+
require "cgi"
|
4
5
|
|
5
6
|
module PristineText
|
6
7
|
def self.pipe(text, locale)
|
@@ -23,7 +24,7 @@ module PristineText
|
|
23
24
|
end
|
24
25
|
|
25
26
|
def self.clean(text, locale= :en, stem= true)
|
26
|
-
text= UnicodeUtils.downcase(text, locale).
|
27
|
+
text= UnicodeUtils.downcase(CGI.unescapeHTML(text).gsub(/ /, "\n"), locale).
|
27
28
|
gsub(/[^\p{Letter}\s]+/, "").
|
28
29
|
strip.squeeze
|
29
30
|
if stem
|
data/pristine_text.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ["Nurettin Onur TUĞCU"]
|
10
10
|
spec.email = ["onurtugcu@gmail.com"]
|
11
11
|
spec.summary = %q{Lowercase, squeeze, stem text.}
|
12
|
-
spec.description = %q{This gem uses unicode_utils to lowercase text, removes non-letters, strips and squeezes whitespace, then optionally stemwords (from
|
12
|
+
spec.description = %q{This gem uses unicode_utils to lowercase text, removes non-letters, strips and squeezes whitespace, then optionally uses stemwords (from libstemming-tools) to stem every word.}
|
13
13
|
spec.homepage = "https://github.com/nurettin/pristine_text"
|
14
14
|
spec.license = "MIT"
|
15
15
|
|
data/test/pristine_text.rb
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
require "pristine_text"
|
2
|
+
require "minitest/autorun"
|
3
|
+
|
4
|
+
class PristineTextTest< Minitest::Unit::TestCase
|
5
|
+
def test_clean
|
6
|
+
assert_equal PristineText.clean("haberler geliyorlar gidiyorlar", :tr), "haber geliyor gidiyor"
|
7
|
+
assert_equal PristineText.clean("}{ÜĞ09 İŞi!'^+çö\n\t\v][';.,üğişçö"", :tr), "üğ işiçö üğişçö"
|
8
|
+
end
|
9
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pristine_text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nurettin Onur TUĞCU
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-11-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -53,8 +53,8 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
description: This gem uses unicode_utils to lowercase text, removes non-letters, strips
|
56
|
-
and squeezes whitespace, then optionally stemwords (from
|
57
|
-
every word.
|
56
|
+
and squeezes whitespace, then optionally uses stemwords (from libstemming-tools)
|
57
|
+
to stem every word.
|
58
58
|
email:
|
59
59
|
- onurtugcu@gmail.com
|
60
60
|
executables: []
|
@@ -69,6 +69,7 @@ files:
|
|
69
69
|
- lib/pristine_text.rb
|
70
70
|
- lib/pristine_text/version.rb
|
71
71
|
- pristine_text.gemspec
|
72
|
+
- test/pristine_text.rb
|
72
73
|
homepage: https://github.com/nurettin/pristine_text
|
73
74
|
licenses:
|
74
75
|
- MIT
|
@@ -89,8 +90,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
89
90
|
version: '0'
|
90
91
|
requirements: []
|
91
92
|
rubyforge_project:
|
92
|
-
rubygems_version: 2.
|
93
|
+
rubygems_version: 2.4.2
|
93
94
|
signing_key:
|
94
95
|
specification_version: 4
|
95
96
|
summary: Lowercase, squeeze, stem text.
|
96
|
-
test_files:
|
97
|
+
test_files:
|
98
|
+
- test/pristine_text.rb
|