pristine_text 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +3 -3
- data/lib/pristine_text/version.rb +1 -1
- data/lib/pristine_text.rb +2 -1
- data/pristine_text.gemspec +1 -1
- data/test/pristine_text.rb +9 -0
- metadata +8 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 26c345b29f87af15925158a92b87951a401ccd1e
|
4
|
+
data.tar.gz: f6bfa2f8ac0574daa87f37eb2f5634a3656686fa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 124f737e2ed1d18e423fae1d12033767d657cb7b9c5281472baf2736e096efc41077b6515087a6498bae0114e861b48216924b365156fdb55d571f821fa01ec4
|
7
|
+
data.tar.gz: 7d9320dd6e6443b376af26bcc74ef83812ef60f881281216e90358d2dfa301bc97fd90a026004d2028cd13934b96f2ab7a61d2434285bd9c555431fa5d8ac234
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# PristineText
|
2
2
|
|
3
|
-
This gem uses unicode_utils to lowercase text, removes non-letters, strips and squeezes whitespace, then optionally stemwords (from
|
3
|
+
This gem uses unicode_utils to lowercase text, removes non-letters, strips and squeezes whitespace, then optionally uses stemwords (from libstemming-tools) to stem every word.
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -20,9 +20,9 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
-
require "pristine_text"
|
23
|
+
require "pristine_text"
|
24
24
|
|
25
|
-
puts PristineText.clean("haberler geliyorlar gidiyorlar", :tr)
|
25
|
+
puts PristineText.clean("haberler geliyorlar gidiyorlar", :tr)
|
26
26
|
|
27
27
|
## Contributing
|
28
28
|
|
data/lib/pristine_text.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require "pristine_text/version"
|
2
2
|
require "open3"
|
3
3
|
require "unicode_utils"
|
4
|
+
require "cgi"
|
4
5
|
|
5
6
|
module PristineText
|
6
7
|
def self.pipe(text, locale)
|
@@ -23,7 +24,7 @@ module PristineText
|
|
23
24
|
end
|
24
25
|
|
25
26
|
def self.clean(text, locale= :en, stem= true)
|
26
|
-
text= UnicodeUtils.downcase(text, locale).
|
27
|
+
text= UnicodeUtils.downcase(CGI.unescapeHTML(text).gsub(/ /, "\n"), locale).
|
27
28
|
gsub(/[^\p{Letter}\s]+/, "").
|
28
29
|
strip.squeeze
|
29
30
|
if stem
|
data/pristine_text.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ["Nurettin Onur TUĞCU"]
|
10
10
|
spec.email = ["onurtugcu@gmail.com"]
|
11
11
|
spec.summary = %q{Lowercase, squeeze, stem text.}
|
12
|
-
spec.description = %q{This gem uses unicode_utils to lowercase text, removes non-letters, strips and squeezes whitespace, then optionally stemwords (from
|
12
|
+
spec.description = %q{This gem uses unicode_utils to lowercase text, removes non-letters, strips and squeezes whitespace, then optionally uses stemwords (from libstemming-tools) to stem every word.}
|
13
13
|
spec.homepage = "https://github.com/nurettin/pristine_text"
|
14
14
|
spec.license = "MIT"
|
15
15
|
|
data/test/pristine_text.rb
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
require "pristine_text"
|
2
|
+
require "minitest/autorun"
|
3
|
+
|
4
|
+
class PristineTextTest< Minitest::Unit::TestCase
|
5
|
+
def test_clean
|
6
|
+
assert_equal PristineText.clean("haberler geliyorlar gidiyorlar", :tr), "haber geliyor gidiyor"
|
7
|
+
assert_equal PristineText.clean("}{ÜĞ09 İŞi!'^+çö\n\t\v][';.,üğişçö"", :tr), "üğ işiçö üğişçö"
|
8
|
+
end
|
9
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pristine_text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nurettin Onur TUĞCU
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-11-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -53,8 +53,8 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
description: This gem uses unicode_utils to lowercase text, removes non-letters, strips
|
56
|
-
and squeezes whitespace, then optionally stemwords (from
|
57
|
-
every word.
|
56
|
+
and squeezes whitespace, then optionally uses stemwords (from libstemming-tools)
|
57
|
+
to stem every word.
|
58
58
|
email:
|
59
59
|
- onurtugcu@gmail.com
|
60
60
|
executables: []
|
@@ -69,6 +69,7 @@ files:
|
|
69
69
|
- lib/pristine_text.rb
|
70
70
|
- lib/pristine_text/version.rb
|
71
71
|
- pristine_text.gemspec
|
72
|
+
- test/pristine_text.rb
|
72
73
|
homepage: https://github.com/nurettin/pristine_text
|
73
74
|
licenses:
|
74
75
|
- MIT
|
@@ -89,8 +90,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
89
90
|
version: '0'
|
90
91
|
requirements: []
|
91
92
|
rubyforge_project:
|
92
|
-
rubygems_version: 2.
|
93
|
+
rubygems_version: 2.4.2
|
93
94
|
signing_key:
|
94
95
|
specification_version: 4
|
95
96
|
summary: Lowercase, squeeze, stem text.
|
96
|
-
test_files:
|
97
|
+
test_files:
|
98
|
+
- test/pristine_text.rb
|