pristine_text 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pristine_text.rb +2 -2
- data/lib/pristine_text/version.rb +1 -1
- data/pristine_text.gemspec +2 -1
- data/test/pristine_text.rb +2 -1
- metadata +18 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aaae999b5981fa51e5d31efeaf45c35e061b3e46
|
4
|
+
data.tar.gz: 345716e8f916ae88766fed3f342690ea7f8a18ff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 22f76bfcc47e5233a13ded3e9c82304c19c8b8656e02fd4e184e37ad0a3c12b95471b8fa3b877202659fe69a75c791e6f56f1019fed5b66dcbebb8cf966d6b7f
|
7
|
+
data.tar.gz: 99a2c9034af5efea3c4be578aecab883553350c210c6df78e1d54bbb3e28499c21b7a55dc7b0b1dd306be1640d2c2739f987f62c1283455382005bdefd29a563
|
data/lib/pristine_text.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require "pristine_text/version"
|
2
2
|
require "open3"
|
3
3
|
require "unicode_utils"
|
4
|
-
require "
|
4
|
+
require "htmlentities"
|
5
5
|
|
6
6
|
module PristineText
|
7
7
|
def self.pipe(text, locale)
|
@@ -24,7 +24,7 @@ module PristineText
|
|
24
24
|
end
|
25
25
|
|
26
26
|
def self.clean(text, locale= :en, stem= true)
|
27
|
-
text= UnicodeUtils.downcase(
|
27
|
+
text= UnicodeUtils.downcase(HTMLEntities.new.decode(text), locale).
|
28
28
|
gsub(/[^\p{Letter}\s]+/, "").
|
29
29
|
strip.squeeze
|
30
30
|
if stem
|
data/pristine_text.gemspec
CHANGED
@@ -20,6 +20,7 @@ Gem::Specification.new do |spec|
|
|
20
20
|
|
21
21
|
spec.add_development_dependency "bundler", "~> 1.7"
|
22
22
|
spec.add_development_dependency "rake", "~> 10.0"
|
23
|
-
spec.add_dependency 'unicode_utils', '~>
|
23
|
+
spec.add_dependency 'unicode_utils', '~> 1.4'
|
24
|
+
spec.add_dependency 'htmlentities', '~> 4.3'
|
24
25
|
spec.required_ruby_version= '~> 2.1'
|
25
26
|
end
|
data/test/pristine_text.rb
CHANGED
@@ -1,9 +1,10 @@
|
|
1
|
+
#encoding: UTF-8
|
1
2
|
require "pristine_text"
|
2
3
|
require "minitest/autorun"
|
3
4
|
|
4
5
|
class PristineTextTest< Minitest::Unit::TestCase
|
5
6
|
def test_clean
|
6
7
|
assert_equal PristineText.clean("haberler geliyorlar gidiyorlar", :tr), "haber geliyor gidiyor"
|
7
|
-
assert_equal PristineText.clean("}{ÜĞ09 İŞi!'^+çö\n\t\v][';.,üğişçö"", :tr), "
|
8
|
+
assert_equal PristineText.clean("}{ÜĞ09 İŞi!'^+çö\n\t\v][';.,üğişçö&quot;", :tr), "üğişiçö üğişçö"
|
8
9
|
end
|
9
10
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pristine_text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nurettin Onur TUĞCU
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-11-
|
11
|
+
date: 2014-11-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -44,14 +44,28 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '1.4'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '1.4'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: htmlentities
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '4.3'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '4.3'
|
55
69
|
description: This gem uses unicode_utils to lowercase text, removes non-letters, strips
|
56
70
|
and squeezes whitespace, then optionally uses stemwords (from libstemming-tools)
|
57
71
|
to stem every word.
|