rhack 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/CURB-LICENSE +51 -0
- data/Gemfile +4 -0
- data/History.txt +4 -0
- data/LICENSE +51 -0
- data/License.txt +17 -0
- data/Manifest.txt +61 -0
- data/README.txt +12 -0
- data/Rakefile +34 -0
- data/ext/curb-original/curb.c +977 -0
- data/ext/curb-original/curb.h +52 -0
- data/ext/curb-original/curb_config.h +235 -0
- data/ext/curb-original/curb_easy.c +3455 -0
- data/ext/curb-original/curb_easy.h +90 -0
- data/ext/curb-original/curb_errors.c +647 -0
- data/ext/curb-original/curb_errors.h +129 -0
- data/ext/curb-original/curb_macros.h +159 -0
- data/ext/curb-original/curb_multi.c +704 -0
- data/ext/curb-original/curb_multi.h +26 -0
- data/ext/curb-original/curb_postfield.c +523 -0
- data/ext/curb-original/curb_postfield.h +40 -0
- data/ext/curb-original/curb_upload.c +80 -0
- data/ext/curb-original/curb_upload.h +30 -0
- data/ext/curb/Makefile +157 -0
- data/ext/curb/curb.c +977 -0
- data/ext/curb/curb.h +52 -0
- data/ext/curb/curb_config.h +235 -0
- data/ext/curb/curb_easy.c +3430 -0
- data/ext/curb/curb_easy.h +94 -0
- data/ext/curb/curb_errors.c +647 -0
- data/ext/curb/curb_errors.h +129 -0
- data/ext/curb/curb_macros.h +159 -0
- data/ext/curb/curb_multi.c +710 -0
- data/ext/curb/curb_multi.h +26 -0
- data/ext/curb/curb_postfield.c +523 -0
- data/ext/curb/curb_postfield.h +40 -0
- data/ext/curb/curb_upload.c +80 -0
- data/ext/curb/curb_upload.h +30 -0
- data/ext/curb/extconf.rb +399 -0
- data/lib/cache.rb +44 -0
- data/lib/curl-global.rb +151 -0
- data/lib/extensions/browser/env.js +697 -0
- data/lib/extensions/browser/jquery.js +7180 -0
- data/lib/extensions/browser/xmlsax.js +1564 -0
- data/lib/extensions/browser/xmlw3cdom_1.js +1444 -0
- data/lib/extensions/browser/xmlw3cdom_2.js +2744 -0
- data/lib/extensions/curb.rb +125 -0
- data/lib/extensions/declarative.rb +153 -0
- data/lib/extensions/johnson.rb +63 -0
- data/lib/frame.rb +766 -0
- data/lib/init.rb +36 -0
- data/lib/rhack.rb +16 -0
- data/lib/rhack.yml.template +19 -0
- data/lib/rhack/proxy/checker.rb +226 -0
- data/lib/rhack/proxy/list.rb +196 -0
- data/lib/rhack/services.rb +445 -0
- data/lib/rhack_in.rb +2 -0
- data/lib/scout.rb +591 -0
- data/lib/words.rb +37 -0
- data/test/test_frame.rb +107 -0
- data/test/test_rhack.rb +5 -0
- data/test/test_scout.rb +53 -0
- metadata +195 -0
data/lib/words.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# Build String::RuDict once, and only when no RuDict constant is visible yet
# (either at top level or under String).
#
# Two sources are tried, in this order:
#   1. RHACK::CONFIG['rudict'] -- treated as a path; the constant is assigned
#      only when the path is a regular file whose YAML content is a Hash.
#   2. RHACK::DB -- when truthy, a RuDictionary model is declared and every row
#      is folded into { word => [form0, form1, form2] }.
#
# NOTE(review): `read`, `is` and `declare` are not defined in this file;
# presumably they come from the project's helper libraries -- confirm.
unless defined?(RuDict) || defined?(String::RuDict)
  dict_source = RHACK::CONFIG['rudict']

  if dict_source
    if File.file?(dict_source)
      # The same local first holds the path and then the parsed document.
      dict_source = YAML.load(read(dict_source))
      String::RuDict = dict_source if dict_source.is(Hash)
    end
  elsif RHACK::DB
    class RuDictionary < ActiveRecord::Base
      declare :rudictionary, :id => false do |t|
        t.string :word
        t.string :form0
        t.string :form1
        t.string :form2
      end

      # Runs at class-definition time: one entry per row, keyed by the word.
      word_rows = all.map do |row|
        [row.word, [row.form0, row.form1, row.form2]]
      end
      String::RuDict = Hash[word_rows]
    end
  end
end
|
21
|
+
|
22
|
+
class String
  # Fallback so that #x can always look words up, even when no dictionary
  # was loaded before this class body ran.
  RuDict = {} unless defined?(RuDict)

  # Renders "<int> <word>" with the receiver put into the grammatical number
  # that matches +int+.
  #
  # For a receiver where +cyr?+ is true, the word is looked up in RuDict,
  # whose values are three-element arrays of forms:
  #   forms[0] -- counts ending in 1, except those ending in 11
  #   forms[1] -- counts ending in 2..4, except those ending in 12..14
  #   forms[2] -- every other count
  # A word missing from RuDict is returned unchanged.
  #
  # For any other receiver the word is kept as is for a count of exactly one
  # and passed through +pluralize+ otherwise.
  #
  # Fixes over the previous version: the 2..4 test looked at the whole number
  # instead of its last digit (22 got forms[2]); the teens were not excluded
  # (11 got forms[0]); and the non-Cyrillic branch tested the last digit, so
  # 11 and 21 were left unpluralized.
  #
  # NOTE(review): +cyr?+ and +pluralize+ are not defined in this file; they
  # are expected to be supplied by libraries loaded elsewhere -- confirm.
  #
  # @param int [Integer] the quantity; its absolute value selects the form
  # @return [String] the quantity, a space, and the selected word form
  def x(int)
    count = int.abs

    word =
      if cyr?
        forms = RuDict[self]
        if forms
          last_digit = count % 10
          last_two   = count % 100
          index =
            if last_digit == 1 && last_two != 11
              0
            elsif (2..4).include?(last_digit) && !(12..14).include?(last_two)
              1
            else
              2
            end
          forms[index]
        else
          self
        end
      else
        count == 1 ? self : pluralize
      end

    "#{int} #{word}"
  end
end
|
data/test/test_frame.rb
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
# Checks what Frame() yields for a bare scout count and for a URL plus
# options: number of scouts, the static flag, root location, and the cookie
# and timeout options as seen on a scout.
class TC_Frame < Test::Unit::TestCase
  include HTTPAccessKit

  def test_init
    # Built from a number only.
    counted = Frame(10)
    assert_equal(10, counted.ss.size)
    assert(!counted.static)

    # Built from a host name plus options.
    targeted = Frame("example.com", :ck => {"key" => "value"}, :timeout => 10)
    assert_equal(20, targeted.ss.size)
    assert_equal("http://example.com", targeted.loc.root)
    assert_instance_of(Scout, targeted.ss.rand)
    assert_equal('value', targeted.ss.next.main_cks.values.to_s)
    assert_equal(10, targeted.ss.next.timeout)
    assert(targeted.static)

    # A URL combined with a zero count must be rejected.
    assert_raise(ArgumentError) do
      Frame("example.com", 0)
    end
  end
end
|
19
|
+
|
20
|
+
# Pins the return value of Frame#interpret_request for a frame created with
# a fixed location ("http://site.org/index.html") and a single scout.
# Every expectation below is the literal four-element array the method is
# required to return, or the error class it is required to raise.
class TC_StaticInterpreter < Test::Unit::TestCase
  include HTTPAccessKit

  def setup
    @f = Frame("http://site.org/index.html", 1)
  end

  # URLs on another host are refused in every argument position.
  def test_target_fail
    assert_raise(TargetError) do
      @f.interpret_request("http://example.com")
    end
    assert_raise(TargetError) do
      @f.interpret_request({}, "http://example.com")
    end
    assert_raise(TargetError) do
      @f.interpret_request({}, true, ["http://example.com", "http://site.org/index.html"])
    end
  end

  def test_simple
    # Options only.
    expected = [nil, [:loadGet, "http://site.org/index.html"], nil, {:eval => true, :a => :b}]
    assert_equal(expected, @f.interpret_request(:a => :b))

    # Explicit URL with :eval switched off.
    expected = [nil, [:loadGet, "http://site.org/"], nil, {:eval => nil}]
    assert_equal(expected, @f.interpret_request("http://site.org/", :eval => nil))

    # Array of absolute URLs with extra options.
    page_urls = (1..2).map { |n| "http://site.org/page_#{n}" }
    expected = [
      true,
      nil,
      [[:loadGet, "http://site.org/page_1"], [:loadGet, "http://site.org/page_2"]],
      {:eval => true, :wait => 1, :headers => {'Referer' => 'localhost'}}
    ]
    assert_equal(expected,
                 @f.interpret_request(page_urls, :wait => 1, :headers => {'Referer' => 'localhost'}))

    # Array holding one relative path.
    expected = [true, nil, [[:loadGet, "http://site.org/page_1"]], {:eval => true}]
    assert_equal(expected, @f.interpret_request(["page_1"]))
  end

  def test_zip
    one_by_one = [
      true,
      nil,
      [[:loadPost, {:a => :b}, false, "http://site.org/page_3"]],
      {:eval => true}
    ]
    assert_equal(one_by_one, @f.interpret_request([{:a => :b}], false, ["page_3"]))
    assert_equal(one_by_one, @f.interpret_request([{:a => :b}], false, ["page_3"], :zip => 1))

    paired = [
      true,
      nil,
      [
        [:loadPost, {:a => :b}, false, "http://site.org/page_3"],
        [:loadPost, {:c => :d}, false, "http://site.org/page_4"]
      ],
      {:eval => true}
    ]
    assert_equal(paired,
                 @f.interpret_request([{:a => :b}, {:c => :d}], :def, ["page_3", "page_4"], :zip => true))
  end

  def test_zip_fail
    assert_raise(ZippingError) do
      @f.interpret_request({:a => :b, :_1 => :_2}, false, "page_3", :zip => 1)
    end
    assert_raise(ZippingError) do
      @f.interpret_request([{:a => :b}], false, "page_3", :zip => 0)
    end
    assert_raise(ZippingError) do
      @f.interpret_request([{:a => :b}, {:_1 => :_2}], false, ["page_3"], :zip => 1)
    end
  end

  # Without zipping, every body is combined with every URL.
  def test_quad
    two_by_two = [
      true,
      nil,
      [
        [:loadPost, {:a => :b}, false, "http://site.org/page_3"],
        [:loadPost, {:a => :b}, false, "http://site.org/page_4"],
        [:loadPost, {:c => :d}, false, "http://site.org/page_3"],
        [:loadPost, {:c => :d}, false, "http://site.org/page_4"]
      ],
      {:eval => true}
    ]
    assert_equal(two_by_two,
                 @f.interpret_request([{:a => :b}, {:c => :d}], :def, ["page_3", "page_4"], :zip => false))
    assert_equal(two_by_two,
                 @f.interpret_request([{:a => :b}, {:c => :d}], ["page_3", "page_4"]))
  end

  def test_implicit
    expected = [true, nil, [[:loadPost, {:a => :b}, false, "http://site.org/index.html"]], {:eval => true}]
    assert_equal(expected, @f.interpret_request([{:a => :b}]))

    expected = [nil, [:loadGet, "http://site.org/index.html"], nil, {:eval => true}]
    assert_equal(expected, @f.interpret_request)

    expected = [nil, [:loadPost, {:a => :b, :_1 => :_2}, false, "http://site.org/"], nil, {:eval => true}]
    assert_equal(expected, @f.interpret_request({:a => :b, :_1 => :_2}, "/"))

    expected = [nil, [:loadPost, {:a => :b, :_1 => :_2}, true, "http://site.org/page_3"], nil, {:eval => true}]
    assert_equal(expected, @f.interpret_request({:a => :b, :_1 => :_2}, "/", "page_3"))

    expected = [
      true,
      nil,
      [[:loadGet, "http://site.org/page_1"], [:loadGet, "http://site.org/page_2"]],
      {:eval => true}
    ]
    assert_equal(expected, @f.interpret_request(['page_1', 'page_2'], true, "/"))
  end

  def test_params_fail
    assert_raise(TypeError) { @f.interpret_request("/", []) }
    assert_raise(TypeError) { @f.interpret_request([], "/") }
    assert_raise(TypeError) { @f.interpret_request("/", "") }
    assert_raise(TypeError) { @f.interpret_request([], "/", :a => :b) }
    assert_raise(TypeError) { @f.interpret_request([], true, "/") }
    assert_raise(ArgumentError) { @f.interpret_request({:a => :b}, []) }
    assert_raise(ArgumentError) { @f.interpret_request({:a => :b}, true, []) }
  end
end
|
89
|
+
|
90
|
+
# Frame#interpret_request on a frame created without any location:
# each call below is required to raise TargetError.
class TC_DynamicInterpreter < Test::Unit::TestCase
  include HTTPAccessKit

  def setup
    @f = Frame()
  end

  def test_target_fail
    # No target at all.
    assert_raise(TargetError) do
      @f.interpret_request
    end
    # Relative path only.
    assert_raise(TargetError) do
      @f.interpret_request([{}], "./")
    end
    # Host name without a scheme.
    assert_raise(TargetError) do
      @f.interpret_request("example.com")
    end
    # URL list mixing an absolute URL with a scheme-less one.
    assert_raise(TargetError) do
      @f.interpret_request({}, true, ["http://example.com", "site.org/index.html"])
    end
  end
end
|
105
|
+
|
106
|
+
|
107
|
+
|
data/test/test_rhack.rb
ADDED
data/test/test_scout.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
# Tests Scout construction and Scout#loadGet. Curl.run is called before and
# Curl.stop after every test. test_load issues requests to real hosts.
#
# assert_equal is called as (expected, actual) throughout, matching
# Test::Unit's signature and the TC_Frame tests, so that failure messages
# label the two values correctly.
class TC_Scout < Test::Unit::TestCase
  include HTTPAccessKit

  def setup
    Curl.run
  end

  def teardown
    Curl.stop
  end

  def test_init
    # Declared outside the block so the scout stays visible after it.
    s = {}
    assert_nothing_raised {
      s = Scout('http://example.com', ['127.0.0.1', 8000], :def, false)
    }
    assert_equal :rand, s.ua
    assert_equal '127.0.0.1:8000', s.proxystr
    assert_nil s.webproxy
  end

  def test_load
    res = nil
    s = Scout('api.rubyonrails.org', :raise => true)
    s.loadGet('/') { |c| res = c.res }
    Curl.wait
    assert_equal 200, res.code
    assert_equal 200, s.http.response_code

    # The code is checked both before and after Curl.wait.
    # NOTE(review): this presumably verifies that the load is asynchronous
    # and that `res` is updated in place -- confirm against Scout.
    s.loadGet 'http://example.com/aaaaaa'
    assert_equal 200, res.code
    Curl.wait
    assert_equal 302, res.code
    assert_equal 302, s.http.response_code

    s.loadGet 'https://developer.mozilla.org/en'
    Curl.wait
    # The callback clears `res`; it must still be set until Curl.wait returns.
    s.loadGet('./CSS') { |c| res = nil }
    assert_equal 200, res.code
    s.cp_on
    Curl.wait
    assert_nil res
    assert_equal "https://developer.mozilla.org/CSS", s.res.req.header.Referer

    s.refforge = false
    s.loadGet { |c| res = c.res.req.url }
    Curl.wait
    assert_equal "https://developer.mozilla.org/en/CSS", res
    assert_nil s.res.req.header.Referer
    assert_not_empty s.main_cks
  end

  # Placeholder: no failure scenarios are covered yet.
  def test_fail
  end
end
|
metadata
ADDED
@@ -0,0 +1,195 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rhack
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 19
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
- 2
|
10
|
+
version: 0.2.2
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Sergey Baev
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2012-08-01 00:00:00 Z
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: rmtools
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
hash: 23
|
29
|
+
segments:
|
30
|
+
- 1
|
31
|
+
- 0
|
32
|
+
- 0
|
33
|
+
version: 1.0.0
|
34
|
+
type: :runtime
|
35
|
+
version_requirements: *id001
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rake
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
hash: 49
|
45
|
+
segments:
|
46
|
+
- 0
|
47
|
+
- 8
|
48
|
+
- 7
|
49
|
+
version: 0.8.7
|
50
|
+
type: :runtime
|
51
|
+
version_requirements: *id002
|
52
|
+
- !ruby/object:Gem::Dependency
|
53
|
+
name: libxml-ruby
|
54
|
+
prerelease: false
|
55
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
hash: 21
|
61
|
+
segments:
|
62
|
+
- 1
|
63
|
+
- 1
|
64
|
+
- 3
|
65
|
+
version: 1.1.3
|
66
|
+
type: :runtime
|
67
|
+
version_requirements: *id003
|
68
|
+
- !ruby/object:Gem::Dependency
|
69
|
+
name: hoe
|
70
|
+
prerelease: false
|
71
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ~>
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
hash: 27
|
77
|
+
segments:
|
78
|
+
- 2
|
79
|
+
- 12
|
80
|
+
version: "2.12"
|
81
|
+
type: :development
|
82
|
+
version_requirements: *id004
|
83
|
+
description: Web scraping library based on curb gem extension and libxml-ruby (and optionally Johnson and ActiveRecord)
|
84
|
+
email:
|
85
|
+
- tinbka@gmail.com
|
86
|
+
executables: []
|
87
|
+
|
88
|
+
extensions:
|
89
|
+
- ext/curb/extconf.rb
|
90
|
+
extra_rdoc_files:
|
91
|
+
- ./Manifest.txt
|
92
|
+
- ./License.txt
|
93
|
+
- ./README.txt
|
94
|
+
- ./History.txt
|
95
|
+
files:
|
96
|
+
- ext/curb/curb_errors.c
|
97
|
+
- ext/curb/curb_errors.h
|
98
|
+
- ext/curb/Makefile
|
99
|
+
- ext/curb/curb_macros.h
|
100
|
+
- ext/curb/curb_multi.c
|
101
|
+
- ext/curb/curb_multi.h
|
102
|
+
- ext/curb/curb_upload.c
|
103
|
+
- ext/curb/curb_upload.h
|
104
|
+
- ext/curb/curb_config.h
|
105
|
+
- ext/curb/extconf.rb
|
106
|
+
- ext/curb/curb.c
|
107
|
+
- ext/curb/curb.h
|
108
|
+
- ext/curb/curb_easy.c
|
109
|
+
- ext/curb/curb_easy.h
|
110
|
+
- ext/curb/curb_postfield.c
|
111
|
+
- ext/curb/curb_postfield.h
|
112
|
+
- ext/curb-original/curb_errors.c
|
113
|
+
- ext/curb-original/curb_errors.h
|
114
|
+
- ext/curb-original/curb_macros.h
|
115
|
+
- ext/curb-original/curb_multi.c
|
116
|
+
- ext/curb-original/curb_multi.h
|
117
|
+
- ext/curb-original/curb_upload.c
|
118
|
+
- ext/curb-original/curb_upload.h
|
119
|
+
- ext/curb-original/curb_config.h
|
120
|
+
- ext/curb-original/curb.c
|
121
|
+
- ext/curb-original/curb.h
|
122
|
+
- ext/curb-original/curb_easy.c
|
123
|
+
- ext/curb-original/curb_easy.h
|
124
|
+
- ext/curb-original/curb_postfield.c
|
125
|
+
- ext/curb-original/curb_postfield.h
|
126
|
+
- lib/rhack/proxy/list.rb
|
127
|
+
- lib/rhack/proxy/checker.rb
|
128
|
+
- lib/rhack/services.rb
|
129
|
+
- lib/cache.rb
|
130
|
+
- lib/rhack.rb
|
131
|
+
- lib/scout.rb
|
132
|
+
- lib/rhack.yml.template
|
133
|
+
- lib/frame.rb
|
134
|
+
- lib/words.rb
|
135
|
+
- lib/curl-global.rb
|
136
|
+
- lib/extensions/curb.rb
|
137
|
+
- lib/extensions/declarative.rb
|
138
|
+
- lib/extensions/johnson.rb
|
139
|
+
- lib/extensions/browser/env.js
|
140
|
+
- lib/extensions/browser/jquery.js
|
141
|
+
- lib/extensions/browser/xmlw3cdom_1.js
|
142
|
+
- lib/extensions/browser/xmlw3cdom_2.js
|
143
|
+
- lib/extensions/browser/xmlsax.js
|
144
|
+
- lib/rhack_in.rb
|
145
|
+
- lib/init.rb
|
146
|
+
- test/test_rhack.rb
|
147
|
+
- test/test_scout.rb
|
148
|
+
- test/test_frame.rb
|
149
|
+
- ./LICENSE
|
150
|
+
- ./Rakefile
|
151
|
+
- ./Manifest.txt
|
152
|
+
- ./CURB-LICENSE
|
153
|
+
- ./License.txt
|
154
|
+
- ./README.txt
|
155
|
+
- ./Gemfile
|
156
|
+
- ./History.txt
|
157
|
+
- .gemtest
|
158
|
+
homepage: http://github.com/tinbka
|
159
|
+
licenses: []
|
160
|
+
|
161
|
+
post_install_message:
|
162
|
+
rdoc_options:
|
163
|
+
- --main
|
164
|
+
- README.txt
|
165
|
+
require_paths:
|
166
|
+
- lib
|
167
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
168
|
+
none: false
|
169
|
+
requirements:
|
170
|
+
- - ">="
|
171
|
+
- !ruby/object:Gem::Version
|
172
|
+
hash: 3
|
173
|
+
segments:
|
174
|
+
- 0
|
175
|
+
version: "0"
|
176
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
177
|
+
none: false
|
178
|
+
requirements:
|
179
|
+
- - ">="
|
180
|
+
- !ruby/object:Gem::Version
|
181
|
+
hash: 3
|
182
|
+
segments:
|
183
|
+
- 0
|
184
|
+
version: "0"
|
185
|
+
requirements: []
|
186
|
+
|
187
|
+
rubyforge_project: rhack
|
188
|
+
rubygems_version: 1.8.17
|
189
|
+
signing_key:
|
190
|
+
specification_version: 3
|
191
|
+
summary: ""
|
192
|
+
test_files:
|
193
|
+
- test/test_rhack.rb
|
194
|
+
- test/test_scout.rb
|
195
|
+
- test/test_frame.rb
|