text_parser 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +20 -0
- data/Rakefile +6 -0
- data/lib/string.rb +5 -0
- data/lib/text_parser.rb +27 -0
- data/lib/text_parser/version.rb +8 -0
- data/test/text_parser_test.rb +66 -0
- data/text_parser.gemspec +7 -0
- metadata +73 -0
data/README.rdoc
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
:title: Text Parser Ruby Gem
|
2
|
+
=Arguments
|
3
|
+
* :dictionary => nil - words to look for; when omitted, every word in the text is counted
|
4
|
+
* :order => :word - key the result is sorted by (:word or :hits)
|
5
|
+
* :order_direction => :asc - sort direction (:asc or :desc)
|
6
|
+
* :negative_dictionary => [] - words to leave out of the result (compared case-insensitively)
|
7
|
+
|
8
|
+
=Usage
|
9
|
+
"Simple, simple test".parse # => [{:word => "simple", :hits => 2}, {:word => "test", :hits => 1}]
|
10
|
+
|
11
|
+
my_text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque pretium consectetur."
|
12
|
+
my_text.parse(:dictionary => ["dolor", "consectetur"])
|
13
|
+
# => [{:word => "consectetur", :hits => 2}, {:word => "dolor", :hits => 1}]
|
14
|
+
|
15
|
+
my_text.parse(:dictionary => ["dolor", "consectetur"], :order => :word, :order_direction => :desc)
|
16
|
+
# => [{:word => "dolor", :hits => 1}, {:word => "consectetur", :hits => 2}]
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
|
data/Rakefile
ADDED
data/lib/string.rb
ADDED
data/lib/text_parser.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# Mixin that adds word-frequency counting to a string-like receiver.
#
# The receiver must respond to +gsub+; the gem's test suite calls +parse+
# directly on String instances.
module TextParser
  # Counts how often words occur in the receiver.
  #
  # Punctuation is stripped first (see +process_text+) and every word is
  # compared and reported in lower case.
  #
  # Options (all optional):
  # * <tt>:dictionary</tt> - terms to look for. Terms are matched literally
  #   and case-insensitively anywhere in the cleaned text (also inside longer
  #   words). When nil, every whitespace-separated word of the text is counted.
  # * <tt>:order</tt> - key the result is sorted by, <tt>:word</tt> (default)
  #   or <tt>:hits</tt>.
  # * <tt>:order_direction</tt> - <tt>:asc</tt> (default) or <tt>:desc</tt>.
  # * <tt>:negative_dictionary</tt> - words to leave out of the result,
  #   compared case-insensitively.
  #
  # Returns an Array of Hashes of the form
  # <tt>{:hits => Integer, :word => String}</tt>; empty when nothing matches.
  def parse(args = {})
    options = {
      :dictionary => nil,
      :order => :word,
      :order_direction => :asc,
      :negative_dictionary => []
    }.merge(args)

    text = process_text
    words =
      if options[:dictionary]
        dictionary_matches(text, Array(options[:dictionary]))
      else
        # Without a dictionary every token counts as itself. Tokenising
        # directly avoids building a regex out of the text, where a short
        # word could shadow a longer word starting with it ("a" vs "aa").
        text.split(" ")
      end

    # Downcase the exclusion list once instead of once per matched word.
    ignored = options[:negative_dictionary].map { |entry| entry.downcase }

    hits = Hash.new(0)
    words.each do |raw|
      word = raw.downcase
      hits[word] += 1 unless ignored.include?(word)
    end

    result = hits.map { |word, count| { :hits => count, :word => word } }
    # The word is a secondary sort key so that entries with equal hits come
    # out in a deterministic order.
    result = result.sort_by { |entry| [entry[options[:order]], entry[:word]] }
    result.reverse! if options[:order_direction] == :desc
    result
  end

  private

  # Returns a copy of the receiver with everything except word characters,
  # whitespace and hyphens removed. <tt>[[:word:]]</tt> is used instead of
  # <tt>\w</tt> because <tt>\w</tt> only matches ASCII in Ruby and would
  # strip accented letters.
  def process_text
    gsub(/[^[:word:]\s\-]/, "")
  end

  # Returns every occurrence of a dictionary term in +text+, in text order
  # and with the casing found in the text.
  def dictionary_matches(text, dictionary)
    terms = dictionary.map { |term| term.to_s }.reject { |term| term.empty? }
    # An empty alternation would match the empty string at every position.
    return [] if terms.empty?

    # Longest terms first, so a term is never shadowed by one of its own
    # prefixes; escaping keeps regex metacharacters in terms literal.
    pattern = terms.sort_by { |term| -term.length }.
                    map { |term| Regexp.escape(term) }.
                    join("|")
    text.scan(Regexp.new(pattern, Regexp::IGNORECASE))
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require "string"
|
3
|
+
require "text_parser"
|
4
|
+
|
5
|
+
# Unit tests for the +parse+ method called on String instances.
#
# Every assert_equal passes the expected value first and the actual value
# second, as test-unit expects, so failure messages label the two values
# correctly.
class TextParserTest < Test::Unit::TestCase

  # String instances must expose a public +parse+ method.
  def test_should_have_method_parse
    assert_respond_to "some text", :parse
  end

  # Only dictionary words are counted when a dictionary is given.
  def test_should_parse
    text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque pretium consectetur."
    expected = [{:word => "consectetur", :hits => 2}, {:word => "dolor", :hits => 1}]
    assert_equal expected, text.parse(:dictionary => ["dolor", "consectetur"])
  end

  # Without a dictionary every word of the text is counted.
  def test_should_parse_without_dictionary
    text = "test test"
    assert_equal [{:word => "test", :hits => 2}], text.parse
  end

  # Punctuation is dropped and words are compared case-insensitively.
  def test_should_remove_some_characters
    text = "Test? Test. Yes, test!"
    expected = [{:word => "test", :hits => 3}, {:word => "yes", :hits => 1}]
    assert_equal expected, text.parse
  end

  # A dictionary with no matching word yields an empty result.
  def test_should_return_an_empty_array
    text = "test"
    assert_equal [], text.parse(:dictionary => ['abc'])
  end

  # Sorting by word in ascending order is the default.
  def test_should_order_by_word_asc
    text = " beta omega gamma alpha gamma"
    expected = [{:word => "alpha", :hits => 1},
                {:word => "beta",  :hits => 1},
                {:word => "gamma", :hits => 2},
                {:word => "omega", :hits => 1}]
    assert_equal expected, text.parse
    assert_equal expected, text.parse(:order => :word)
    assert_equal expected, text.parse(:order => :word, :order_direction => :asc)
  end

  def test_should_order_by_word_desc
    expected = [{:word => "zzz", :hits => 1}, {:word => "aaa", :hits => 1}]
    assert_equal expected, "aaa zzz".parse(:order => :word, :order_direction => :desc)
  end

  def test_should_order_by_hits_asc
    text = "gamma alpha gamma beta alpha gamma"
    expected = [{:word => "beta",  :hits => 1},
                {:word => "alpha", :hits => 2},
                {:word => "gamma", :hits => 3}]
    assert_equal expected, text.parse(:order => :hits)
    assert_equal expected, text.parse(:order => :hits, :order_direction => :asc)
  end

  def test_should_order_by_hits_desc
    text = "gamma alpha gamma beta alpha gamma"
    expected = [{:word => "gamma", :hits => 3},
                {:word => "alpha", :hits => 2},
                {:word => "beta",  :hits => 1}]
    assert_equal expected, text.parse(:order => :hits, :order_direction => :desc)
  end

  # Words listed in the negative dictionary are excluded, whatever their case.
  def test_should_ignore_negative_dictionary
    text = "This is good"
    assert_equal [{:word => "good", :hits => 1}], text.parse(:negative_dictionary => ["is", "this"])
  end
end
|
data/text_parser.gemspec
ADDED
metadata
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: text_parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Frederico de Paula
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-12-01 00:00:00 -02:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description:
|
23
|
+
email:
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files: []
|
29
|
+
|
30
|
+
files:
|
31
|
+
- lib/string.rb
|
32
|
+
- lib/text_parser/version.rb
|
33
|
+
- lib/text_parser.rb
|
34
|
+
- README.rdoc
|
35
|
+
- test/text_parser_test.rb
|
36
|
+
- Rakefile
|
37
|
+
- text_parser.gemspec
|
38
|
+
has_rdoc: true
|
39
|
+
homepage:
|
40
|
+
licenses: []
|
41
|
+
|
42
|
+
post_install_message:
|
43
|
+
rdoc_options: []
|
44
|
+
|
45
|
+
require_paths:
|
46
|
+
- lib
|
47
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
48
|
+
none: false
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
hash: 3
|
53
|
+
segments:
|
54
|
+
- 0
|
55
|
+
version: "0"
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
hash: 3
|
62
|
+
segments:
|
63
|
+
- 0
|
64
|
+
version: "0"
|
65
|
+
requirements: []
|
66
|
+
|
67
|
+
rubyforge_project:
|
68
|
+
rubygems_version: 1.4.2
|
69
|
+
signing_key:
|
70
|
+
specification_version: 3
|
71
|
+
summary: An easy way to parse text.
|
72
|
+
test_files: []
|
73
|
+
|