pristine_text 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 03ce1c942c7e32460cf0c312a5d897a205f4a2a7
4
- data.tar.gz: 67916011e5695da1ec0c521e5f693215f3c82238
3
+ metadata.gz: 26c345b29f87af15925158a92b87951a401ccd1e
4
+ data.tar.gz: f6bfa2f8ac0574daa87f37eb2f5634a3656686fa
5
5
  SHA512:
6
- metadata.gz: c43004bf30849ab031a6c314a52bad0b863c1591304c082e5761dccc55f0bf658e9ea855a6a46e03c6e35412355c62347c494c2b4f8f25ec30a21a2dce10d830
7
- data.tar.gz: 43a0bbcb713ab367ccd531448a82a101de22af9f85fe6fdd5acf2556918533cb647ad575f2af790139cdfe89a62f1408486f124d675e236e0129ae886522d015
6
+ metadata.gz: 124f737e2ed1d18e423fae1d12033767d657cb7b9c5281472baf2736e096efc41077b6515087a6498bae0114e861b48216924b365156fdb55d571f821fa01ec4
7
+ data.tar.gz: 7d9320dd6e6443b376af26bcc74ef83812ef60f881281216e90358d2dfa301bc97fd90a026004d2028cd13934b96f2ab7a61d2434285bd9c555431fa5d8ac234
data/.gitignore CHANGED
@@ -12,3 +12,4 @@
12
12
  *.o
13
13
  *.a
14
14
  mkmf.log
15
+ *.gem
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # PristineText
2
2
 
3
- This gem uses unicode_utils to lowercase text, removes non-letters, strips and squeezes whitespace, then optionally stemwords (from stemming-tools) to stem every word.
3
+ This gem uses unicode_utils to lowercase text, removes non-letters, strips and squeezes whitespace, then optionally uses stemwords (from libstemming-tools) to stem every word.
4
4
 
5
5
  ## Installation
6
6
 
@@ -20,9 +20,9 @@ Or install it yourself as:
20
20
 
21
21
  ## Usage
22
22
 
23
- require "pristine_text"
23
+ require "pristine_text"
24
24
 
25
- puts PristineText.clean("haberler geliyorlar gidiyorlar", :tr)
25
+ puts PristineText.clean("haberler geliyorlar gidiyorlar", :tr)
26
26
 
27
27
  ## Contributing
28
28
 
@@ -1,3 +1,3 @@
1
1
  module PristineText
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/lib/pristine_text.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require "pristine_text/version"
2
2
  require "open3"
3
3
  require "unicode_utils"
4
+ require "cgi"
4
5
 
5
6
  module PristineText
6
7
  def self.pipe(text, locale)
@@ -23,7 +24,7 @@ module PristineText
23
24
  end
24
25
 
25
26
  def self.clean(text, locale= :en, stem= true)
26
- text= UnicodeUtils.downcase(text, locale).
27
+ text= UnicodeUtils.downcase(CGI.unescapeHTML(text).gsub(/ /, "\n"), locale).
27
28
  gsub(/[^\p{Letter}\s]+/, "").
28
29
  strip.squeeze
29
30
  if stem
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ["Nurettin Onur TUĞCU"]
10
10
  spec.email = ["onurtugcu@gmail.com"]
11
11
  spec.summary = %q{Lowercase, squeeze, stem text.}
12
- spec.description = %q{This gem uses unicode_utils to lowercase text, removes non-letters, strips and squeezes whitespace, then optionally stemwords (from stemming-tools) to stem every word.}
12
+ spec.description = %q{This gem uses unicode_utils to lowercase text, removes non-letters, strips and squeezes whitespace, then optionally uses stemwords (from libstemming-tools) to stem every word.}
13
13
  spec.homepage = "https://github.com/nurettin/pristine_text"
14
14
  spec.license = "MIT"
15
15
 
@@ -0,0 +1,9 @@
1
+ require "pristine_text"
2
+ require "minitest/autorun"
3
+
4
+ class PristineTextTest< Minitest::Unit::TestCase
5
+ def test_clean
6
+ assert_equal PristineText.clean("haberler geliyorlar gidiyorlar", :tr), "haber geliyor gidiyor"
7
+ assert_equal PristineText.clean("}{ÜĞ09&nbsp;İŞi!'^+çö\n\t\v][';.,üğişçö&quot;", :tr), "üğ işiçö üğişçö"
8
+ end
9
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pristine_text
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nurettin Onur TUĞCU
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-31 00:00:00.000000000 Z
11
+ date: 2014-11-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -53,8 +53,8 @@ dependencies:
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  description: This gem uses unicode_utils to lowercase text, removes non-letters, strips
56
- and squeezes whitespace, then optionally stemwords (from stemming-tools) to stem
57
- every word.
56
+ and squeezes whitespace, then optionally uses stemwords (from libstemming-tools)
57
+ to stem every word.
58
58
  email:
59
59
  - onurtugcu@gmail.com
60
60
  executables: []
@@ -69,6 +69,7 @@ files:
69
69
  - lib/pristine_text.rb
70
70
  - lib/pristine_text/version.rb
71
71
  - pristine_text.gemspec
72
+ - test/pristine_text.rb
72
73
  homepage: https://github.com/nurettin/pristine_text
73
74
  licenses:
74
75
  - MIT
@@ -89,8 +90,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
89
90
  version: '0'
90
91
  requirements: []
91
92
  rubyforge_project:
92
- rubygems_version: 2.2.2
93
+ rubygems_version: 2.4.2
93
94
  signing_key:
94
95
  specification_version: 4
95
96
  summary: Lowercase, squeeze, stem text.
96
- test_files: []
97
+ test_files:
98
+ - test/pristine_text.rb