text_parser 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/text_parser/version.rb +1 -1
- data/lib/text_parser.rb +6 -4
- data/test/text_parser_test.rb +25 -2
- data/text_parser.gemspec +1 -1
- metadata +4 -4
data/lib/text_parser/version.rb
CHANGED
data/lib/text_parser.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'iconv'
|
1
2
|
module TextParser
|
2
3
|
# Returns a parsed text with the words and its occurrences.
|
3
4
|
# @param [Hash] [args]
|
@@ -14,10 +15,10 @@ module TextParser
|
|
14
15
|
text = process_text
|
15
16
|
options[:dictionary] = text.split(" ") unless options[:dictionary]
|
16
17
|
return [] if options[:dictionary].count < 1
|
17
|
-
regex = Regexp.new(options[:dictionary].join("
|
18
|
-
match_result = text.scan(regex).map{|i| i.downcase}
|
18
|
+
regex = Regexp.new("(\\b#{options[:dictionary].join('\\b|\\b')}\\b)", Regexp::IGNORECASE)
|
19
|
+
match_result = text.scan(regex).map{|i| i.shift.downcase}
|
19
20
|
match_result.each do |w|
|
20
|
-
result << {:hits => match_result.count(w), :word => w} unless result.select{|r| r[:word] == w}.shift
|
21
|
+
result << {:hits => match_result.count(w), :word => w} unless result.select{|r| r[:word] == w}.shift || options[:negative_dictionary].map{|i| i.downcase}.include?(w)
|
21
22
|
end
|
22
23
|
result = result.sort_by{|i| i[options[:order]]}
|
23
24
|
result.reverse! if options[:order_direction] == :desc
|
@@ -27,7 +28,8 @@ module TextParser
|
|
27
28
|
private
|
28
29
|
|
29
30
|
def process_text
|
30
|
-
self.gsub(
|
31
|
+
text = self.gsub(/\s{2,}/," ")
|
32
|
+
text = text.gsub(/[^\w\s\-]/, "")
|
31
33
|
end
|
32
34
|
end
|
33
35
|
|
data/test/text_parser_test.rb
CHANGED
@@ -2,7 +2,6 @@ require "test/unit"
|
|
2
2
|
require "text_parser"
|
3
3
|
|
4
4
|
class TextParserTest < Test::Unit::TestCase
|
5
|
-
|
6
5
|
def test_should_have_method_parse
|
7
6
|
assert "some text".methods.select{|a| a == "parse"}.count > 0
|
8
7
|
end
|
@@ -79,4 +78,28 @@ class TextParserTest < Test::Unit::TestCase
|
|
79
78
|
assert_equal "text".parse(args), [{:word => "text", :hits => 1}]
|
80
79
|
end
|
81
80
|
|
82
|
-
|
81
|
+
def test_should_work_with_many_spaces
|
82
|
+
text = "e se eu encher de espacos"
|
83
|
+
assert_equal [{:word => "de", :hits => 1},
|
84
|
+
{:word => "e", :hits => 1},
|
85
|
+
{:word => "encher", :hits => 1},
|
86
|
+
{:word => "espacos", :hits => 1},
|
87
|
+
{:word => "eu", :hits => 1},
|
88
|
+
{:word => "se", :hits => 1}], text.parse
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
|
93
|
+
|
94
|
+
|
95
|
+
|
96
|
+
|
97
|
+
|
98
|
+
|
99
|
+
|
100
|
+
|
101
|
+
|
102
|
+
|
103
|
+
|
104
|
+
|
105
|
+
|
data/text_parser.gemspec
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
- 3
|
10
|
-
version: 0.1.3
|
9
|
+
- 4
|
10
|
+
version: 0.1.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Frederico de Paula
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-12-
|
18
|
+
date: 2011-12-08 00:00:00 -02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies: []
|
21
21
|
|