pristine_text 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 26c345b29f87af15925158a92b87951a401ccd1e
4
- data.tar.gz: f6bfa2f8ac0574daa87f37eb2f5634a3656686fa
3
+ metadata.gz: aaae999b5981fa51e5d31efeaf45c35e061b3e46
4
+ data.tar.gz: 345716e8f916ae88766fed3f342690ea7f8a18ff
5
5
  SHA512:
6
- metadata.gz: 124f737e2ed1d18e423fae1d12033767d657cb7b9c5281472baf2736e096efc41077b6515087a6498bae0114e861b48216924b365156fdb55d571f821fa01ec4
7
- data.tar.gz: 7d9320dd6e6443b376af26bcc74ef83812ef60f881281216e90358d2dfa301bc97fd90a026004d2028cd13934b96f2ab7a61d2434285bd9c555431fa5d8ac234
6
+ metadata.gz: 22f76bfcc47e5233a13ded3e9c82304c19c8b8656e02fd4e184e37ad0a3c12b95471b8fa3b877202659fe69a75c791e6f56f1019fed5b66dcbebb8cf966d6b7f
7
+ data.tar.gz: 99a2c9034af5efea3c4be578aecab883553350c210c6df78e1d54bbb3e28499c21b7a55dc7b0b1dd306be1640d2c2739f987f62c1283455382005bdefd29a563
data/lib/pristine_text.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  require "pristine_text/version"
2
2
  require "open3"
3
3
  require "unicode_utils"
4
- require "cgi"
4
+ require "htmlentities"
5
5
 
6
6
  module PristineText
7
7
  def self.pipe(text, locale)
@@ -24,7 +24,7 @@ module PristineText
24
24
  end
25
25
 
26
26
  def self.clean(text, locale= :en, stem= true)
27
- text= UnicodeUtils.downcase(CGI.unescapeHTML(text).gsub(/ /, "\n"), locale).
27
+ text= UnicodeUtils.downcase(HTMLEntities.new.decode(text), locale).
28
28
  gsub(/[^\p{Letter}\s]+/, "").
29
29
  strip.squeeze
30
30
  if stem
@@ -1,3 +1,3 @@
1
1
  module PristineText
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
@@ -20,6 +20,7 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency "bundler", "~> 1.7"
22
22
  spec.add_development_dependency "rake", "~> 10.0"
23
- spec.add_dependency 'unicode_utils', '~> 0'
23
+ spec.add_dependency 'unicode_utils', '~> 1.4'
24
+ spec.add_dependency 'htmlentities', '~> 4.3'
24
25
  spec.required_ruby_version= '~> 2.1'
25
26
  end
@@ -1,9 +1,10 @@
1
+ #encoding: UTF-8
1
2
  require "pristine_text"
2
3
  require "minitest/autorun"
3
4
 
4
5
  class PristineTextTest< Minitest::Unit::TestCase
5
6
  def test_clean
6
7
  assert_equal PristineText.clean("haberler geliyorlar gidiyorlar", :tr), "haber geliyor gidiyor"
7
- assert_equal PristineText.clean("}{ÜĞ09&nbsp;İŞi!'^+çö\n\t\v][';.,üğişçö&quot;", :tr), "üğ işiçö üğişçö"
8
+ assert_equal PristineText.clean("}{ÜĞ09&nbsp;İŞi!'^+çö\n\t\v][';.,üğişçö&quot;", :tr), "üğişiçö üğişçö"
8
9
  end
9
10
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pristine_text
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nurettin Onur TUĞCU
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-11-01 00:00:00.000000000 Z
11
+ date: 2014-11-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -44,14 +44,28 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: '1.4'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
54
+ version: '1.4'
55
+ - !ruby/object:Gem::Dependency
56
+ name: htmlentities
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '4.3'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '4.3'
55
69
  description: This gem uses unicode_utils to lowercase text, removes non-letters, strips
56
70
  and squeezes whitespace, then optionally uses stemwords (from libstemming-tools)
57
71
  to stem every word.