text_parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +20 -0
- data/Rakefile +6 -0
- data/lib/string.rb +5 -0
- data/lib/text_parser.rb +27 -0
- data/lib/text_parser/version.rb +8 -0
- data/test/text_parser_test.rb +66 -0
- data/text_parser.gemspec +7 -0
- metadata +73 -0
data/README.rdoc
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
:title: Text Parser Ruby Gem
|
2
|
+
=Arguments
|
3
|
+
* :dictionary => nil,
|
4
|
+
* :order => :word,
|
5
|
+
* :order_direction => :asc,
|
6
|
+
* :negative_dictionary => []
|
7
|
+
|
8
|
+
=Usage
|
9
|
+
"Simple, simple test".parse # => [{:word => "simple", :hits => 2}, {:word => "test", :hits => 1}]
|
10
|
+
|
11
|
+
my_text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque pretium consectetur."
|
12
|
+
my_text.parse(:dictionary => ["dolor", "consectetur"])
|
13
|
+
# => [{:word => "consectetur", :hits => 2}, {:word => "dolor", :hits => 1}]
|
14
|
+
|
15
|
+
my_text.parse(:dictionary => ["dolor", "consectetur"], :order => :word, :order_direction => :desc)
|
16
|
+
# => [{:word => "dolor", :hits => 1}, {:word => "consectetur", :hits => 2}]
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
|
data/Rakefile
ADDED
data/lib/string.rb
ADDED
data/lib/text_parser.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# Word-frequency parsing for strings.
#
# Written as a mixin: #parse obtains the text to analyse from the private
# #process_text helper, which calls +gsub+ on +self+.
module TextParser
  # Counts how often each dictionary word occurs in the receiver.
  #
  # Punctuation is stripped first (see #process_text). Matching is
  # case-insensitive and only whole, whitespace-delimited tokens are counted;
  # reported words are always lower-cased.
  #
  # Options (all optional):
  # [:dictionary]          Words to look for. When nil, every
  #                        whitespace-separated word of the text is used.
  # [:order]               Result key to sort by, :word or :hits.
  # [:order_direction]     :asc (default) or :desc.
  # [:negative_dictionary] Words to leave out of the result.
  #
  # Returns an Array of Hashes shaped like {:word => "simple", :hits => 2}.
  # An empty text or an empty dictionary yields an empty Array.
  def parse(args = {})
    options = {
      :dictionary => nil,
      :order => :word,
      :order_direction => :asc,
      :negative_dictionary => []
    }.merge(args)

    text = process_text
    words = (options[:dictionary] || text.split(" ")).map { |w| w.to_s.strip }.uniq
    words = words.reject { |w| w.empty? }
    # An empty alternation would match the empty string at every position
    # and report a bogus "" word, so bail out early.
    return [] if words.empty?

    # Escape every entry so regexp metacharacters are taken literally, and
    # require whitespace (or a text edge) on both sides so that "is" is not
    # counted inside "this" and "test" does not shadow "testing".
    # The leading separator is consumed outside the capture group and the
    # trailing one is only looked ahead at, so adjacent words still match.
    pattern = words.map { |w| Regexp.escape(w) }.join("|")
    regex = Regexp.new("(?:\\A|\\s)(#{pattern})(?=\\s|\\z)", Regexp::IGNORECASE)
    matches = text.scan(regex).map { |captures| captures.first.downcase }

    # Single counting pass instead of calling Array#count once per match.
    hits = Hash.new(0)
    matches.each { |w| hits[w] += 1 }

    excluded = options[:negative_dictionary].map { |w| w.downcase }
    result = (matches.uniq - excluded).map { |w| {:hits => hits[w], :word => w} }

    result = result.sort_by { |entry| entry[options[:order]] }
    result.reverse! if options[:order_direction] == :desc
    result
  end

  private

  # Returns a copy of the receiver with every character removed that is not
  # a word character, whitespace or a hyphen.
  def process_text
    self.gsub(/[^\w\s\-]/, "")
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require "string"
|
3
|
+
require "text_parser"
|
4
|
+
|
5
|
+
# Unit tests for String#parse.
#
# Every assert_equal passes the expected value first and the actual value
# second, so failure messages label the two sides correctly.
class TextParserTest < Test::Unit::TestCase

  # Strings must expose a public #parse method.
  def test_should_have_method_parse
    assert_respond_to "some text", :parse
  end

  # Only the words listed in :dictionary are counted.
  def test_should_parse
    text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque pretium consectetur."
    expected = [{:word => "consectetur", :hits => 2}, {:word => "dolor", :hits => 1}]
    assert_equal expected, text.parse(:dictionary => ["dolor", "consectetur"])
  end

  # Without a dictionary every word of the text is counted.
  def test_should_parse_without_dictionary
    text = "test test"
    assert_equal [{:word => "test", :hits => 2}], text.parse
  end

  # Punctuation is ignored and matching is case-insensitive.
  def test_should_remove_some_characters
    text = "Test? Test. Yes, test!"
    expected = [{:word => "test", :hits => 3}, {:word => "yes", :hits => 1}]
    assert_equal expected, text.parse
  end

  # A dictionary with no matching word gives an empty result.
  def test_should_return_an_empty_array
    text = "test"
    assert_equal [], text.parse(:dictionary => ['abc'])
  end

  # Ascending order by word is the default.
  def test_should_order_by_word_asc
    text = " beta omega gamma alpha gamma"
    expected = [{:word => "alpha", :hits => 1},
                {:word => "beta", :hits => 1},
                {:word => "gamma", :hits => 2},
                {:word => "omega", :hits => 1}]
    assert_equal expected, text.parse
    assert_equal expected, text.parse(:order => :word)
    assert_equal expected, text.parse(:order => :word, :order_direction => :asc)
  end

  def test_should_order_by_word_desc
    expected = [{:word => "zzz", :hits => 1}, {:word => "aaa", :hits => 1}]
    assert_equal expected, "aaa zzz".parse(:order => :word, :order_direction => :desc)
  end

  # Ascending is also the default direction when ordering by hits.
  def test_should_order_by_hits_asc
    text = "gamma alpha gamma beta alpha gamma"
    expected = [{:word => "beta", :hits => 1},
                {:word => "alpha", :hits => 2},
                {:word => "gamma", :hits => 3}]
    assert_equal expected, text.parse(:order => :hits)
    assert_equal expected, text.parse(:order => :hits, :order_direction => :asc)
  end

  def test_should_order_by_hits_desc
    text = "gamma alpha gamma beta alpha gamma"
    expected = [{:word => "gamma", :hits => 3},
                {:word => "alpha", :hits => 2},
                {:word => "beta", :hits => 1}]
    assert_equal expected, text.parse(:order => :hits, :order_direction => :desc)
  end

  # Words in :negative_dictionary are dropped regardless of their case in the text.
  def test_should_ignore_negative_dictionary
    text = "This is good"
    assert_equal [{:word => "good", :hits => 1}], text.parse(:negative_dictionary => ["is", "this"])
  end
end
|
data/text_parser.gemspec
ADDED
metadata
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: text_parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Frederico de Paula
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-12-01 00:00:00 -02:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description:
|
23
|
+
email:
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files: []
|
29
|
+
|
30
|
+
files:
|
31
|
+
- lib/string.rb
|
32
|
+
- lib/text_parser/version.rb
|
33
|
+
- lib/text_parser.rb
|
34
|
+
- README.rdoc
|
35
|
+
- test/text_parser_test.rb
|
36
|
+
- Rakefile
|
37
|
+
- text_parser.gemspec
|
38
|
+
has_rdoc: true
|
39
|
+
homepage:
|
40
|
+
licenses: []
|
41
|
+
|
42
|
+
post_install_message:
|
43
|
+
rdoc_options: []
|
44
|
+
|
45
|
+
require_paths:
|
46
|
+
- lib
|
47
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
48
|
+
none: false
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
hash: 3
|
53
|
+
segments:
|
54
|
+
- 0
|
55
|
+
version: "0"
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
hash: 3
|
62
|
+
segments:
|
63
|
+
- 0
|
64
|
+
version: "0"
|
65
|
+
requirements: []
|
66
|
+
|
67
|
+
rubyforge_project:
|
68
|
+
rubygems_version: 1.4.2
|
69
|
+
signing_key:
|
70
|
+
specification_version: 3
|
71
|
+
summary: An easy way to parse text.
|
72
|
+
test_files: []
|
73
|
+
|