rhack 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +0 -4
- data/Manifest.txt +0 -1
- data/README.txt +1 -1
- data/Rakefile +2 -2
- data/ext/curb/curb_easy.c +8 -0
- data/lib/curl-global.rb +3 -1
- data/lib/frame.rb +1 -1
- data/lib/init.rb +24 -13
- data/lib/rhack.rb +2 -1
- data/lib/rhack/services.rb +2 -0
- data/lib/scout.rb +20 -20
- metadata +5 -7
- data/License.txt +0 -17
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
data/README.txt
CHANGED
data/Rakefile
CHANGED
@@ -8,13 +8,13 @@ rescue LoadError
|
|
8
8
|
end
|
9
9
|
|
10
10
|
compile_manifest
|
11
|
-
RHACK_VERSION = '0.2.2'
|
11
|
+
RHACK_VERSION = '0.3.0'
|
12
12
|
begin
|
13
13
|
require 'hoe'
|
14
14
|
config = Hoe.spec "rhack" do |h|
|
15
15
|
h.developer("Sergey Baev", "tinbka@gmail.com")
|
16
16
|
|
17
|
-
h.description = '
|
17
|
+
h.description = 'Webscraping library based on curb gem extension and libxml-ruby (and optionally Johnson and ActiveRecord)'
|
18
18
|
# h.summary = h.description
|
19
19
|
h.url = 'http://github.com/tinbka'
|
20
20
|
|
data/ext/curb/curb_easy.c
CHANGED
@@ -212,6 +212,12 @@ static void ruby_curl_easy_free(ruby_curl_easy *rbce) {
|
|
212
212
|
if (rbce->curl) {
|
213
213
|
curl_easy_cleanup(rbce->curl);
|
214
214
|
}
|
215
|
+
|
216
|
+
if (rbce->first != NULL) {
|
217
|
+
curl_formfree(rbce->first);
|
218
|
+
rbce->first = NULL;
|
219
|
+
rbce->last = NULL;
|
220
|
+
}
|
215
221
|
}
|
216
222
|
|
217
223
|
void curl_easy_free(ruby_curl_easy *rbce) {
|
@@ -261,6 +267,8 @@ static void ruby_curl_easy_zero(ruby_curl_easy *rbce) {
|
|
261
267
|
rbce->enable_cookies = 0;
|
262
268
|
rbce->ignore_content_length = 0;
|
263
269
|
rbce->callback_active = 0;
|
270
|
+
rbce->first = NULL;
|
271
|
+
rbce->last = NULL;
|
264
272
|
}
|
265
273
|
|
266
274
|
/*
|
data/lib/curl-global.rb
CHANGED
@@ -118,12 +118,14 @@ module Curl
|
|
118
118
|
recall
|
119
119
|
execute
|
120
120
|
end
|
121
|
+
alias :reload :reset
|
121
122
|
|
122
123
|
def reset!
|
123
124
|
recall!
|
124
125
|
execute
|
125
126
|
end
|
126
|
-
|
127
|
+
alias :reload! :reset!
|
128
|
+
module_function :reset!, :reset, :reload!, :reload
|
127
129
|
|
128
130
|
def status(raise_e=true)
|
129
131
|
if $CarierThread and (s = $CarierThread.status)
|
data/lib/frame.rb
CHANGED
@@ -698,7 +698,7 @@ module HTTPAccessKit
|
|
698
698
|
form = "[action=#{@loc.path.inspect}]" if form == :self
|
699
699
|
if form.is String
|
700
700
|
form_node = at form
|
701
|
-
raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node
|
701
|
+
raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
|
702
702
|
else form_node = form
|
703
703
|
end
|
704
704
|
hash = form_node.inputs_all.merge!(hash)
|
data/lib/init.rb
CHANGED
@@ -1,27 +1,38 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
module HTTPAccessKit
|
3
3
|
include RMTools
|
4
|
-
|
4
|
+
extend RMTools
|
5
|
+
|
6
|
+
CONFIG = if ENV["APP_ROOT"]
|
7
|
+
YAML.load(read(File.join ENV["APP_ROOT"], 'config/rhack.yml') || '') || {}
|
8
|
+
else
|
9
|
+
YAML.load(read(%W(config/rhack.yml rhack.yml #{File.join(ENV['HOME'], 'rhack.yml')})) || '') || {}
|
10
|
+
end
|
5
11
|
|
6
12
|
UAS = if File.file?(uas = CONFIG['ua file'] || File.join(ENV['HOME'], 'ua.txt'))
|
7
13
|
IO.read(uas)/"\n"
|
8
|
-
else ['Mozilla/5.0 (Windows
|
14
|
+
else ['Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1'] end
|
9
15
|
|
10
16
|
L = RMLogger.new(CONFIG.logger || {})
|
11
17
|
|
12
|
-
|
18
|
+
db_config = if ENV["APP_ROOT"] and ENV["RAILS_ENV"] and File.file?(dbyml = File.join(ENV["APP_ROOT"], 'config/database.yml'))
|
19
|
+
YAML.load(read(dbyml))[ENV["RAILS_ENV"]]
|
20
|
+
else
|
21
|
+
CONFIG.db || File.join(ENV['HOME'], 'db.yml')
|
22
|
+
end
|
13
23
|
begin
|
14
|
-
|
24
|
+
DB = ActiveRecord::Base.establish_connection_with db_config
|
15
25
|
rescue LoadError
|
16
|
-
|
26
|
+
DB = nil
|
17
27
|
end
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
28
|
+
if DB and !(CONFIG.cache and CONFIG.cache.enabled == false)
|
29
|
+
cache = CONFIG.cache || {}
|
30
|
+
CacheDir = cache.dir || File.join(File.dirname(__FILE__), 'cache')
|
31
|
+
CacheTable = (cache.table || :rhack_cache).to_sym
|
32
|
+
CacheTTL = cache.clean ? eval(cache.clean).b : nil
|
33
|
+
end
|
34
|
+
|
35
|
+
RETRY = CONFIG['scout retry'] || {}
|
25
36
|
|
26
37
|
$uas ||= UAS
|
27
38
|
$Carier ||= Curl::Multi.new
|
@@ -32,5 +43,5 @@ module HTTPAccessKit
|
|
32
43
|
end
|
33
44
|
end
|
34
45
|
|
35
|
-
module Curl;
|
46
|
+
module Curl; extend HTTPAccessKit end
|
36
47
|
RHACK = HTTPAccessKit
|
data/lib/rhack.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
$KCODE = 'UTF-8' if RUBY_VERSION < "1.9"
|
3
3
|
require 'rmtools_dev' unless defined? RMTools
|
4
|
+
requrie 'cgi'
|
4
5
|
here = File.expand_path File.dirname __FILE__
|
5
6
|
require File.join(here, 'curb_core.so')
|
6
7
|
require 'active_record'
|
@@ -13,4 +14,4 @@ require "#{here}/curl-global"
|
|
13
14
|
require "#{here}/scout"
|
14
15
|
require "#{here}/frame"
|
15
16
|
require "#{here}/words"
|
16
|
-
require "#{here}/cache" if RHACK::
|
17
|
+
require "#{here}/cache" if defined? RHACK::CacheDir
|
data/lib/rhack/services.rb
CHANGED
data/lib/scout.rb
CHANGED
@@ -67,14 +67,14 @@ module Curl
|
|
67
67
|
end
|
68
68
|
|
69
69
|
@req = {}
|
70
|
-
@req.url
|
71
|
-
@req.
|
70
|
+
@req.url = easy.last_effective_url
|
71
|
+
@req.headers = easy.headers
|
72
72
|
if range = easy.headers.Range and range[/(\d+)-(\d+)/]
|
73
|
-
@req.range
|
73
|
+
@req.range = $1.to_i .. $2.to_i
|
74
74
|
end
|
75
75
|
if easy.base and @req.meth = easy.base.last_method and @req.meth == :post
|
76
|
-
@req.body
|
77
|
-
@req.mp
|
76
|
+
@req.body = easy.post_body
|
77
|
+
@req.mp = easy.multipart_form_post?
|
78
78
|
end
|
79
79
|
end
|
80
80
|
|
@@ -90,6 +90,7 @@ module Curl
|
|
90
90
|
@error ? @error[key_or_index] : @hash[key_or_index.downcase]
|
91
91
|
end
|
92
92
|
|
93
|
+
alias :headers :hash
|
93
94
|
end
|
94
95
|
|
95
96
|
end
|
@@ -101,25 +102,20 @@ module HTTPAccessKit
|
|
101
102
|
|
102
103
|
def initialize(*args)
|
103
104
|
if args[1].is Scout
|
104
|
-
|
105
|
+
str, scout = *args
|
105
106
|
ck = str//;\s*/
|
106
107
|
ck[1..-1].each {|par|
|
107
108
|
a = par/'='
|
108
|
-
case a[0]
|
109
|
+
case a[0].downcase
|
109
110
|
when 'path'; @path = (a[1] == '/') ? // : /^#{Regexp.escape a[1]}/
|
110
|
-
when 'domain'
|
111
|
-
|
112
|
-
domain = a[1][1..-1]
|
113
|
-
@domain = /(^|\.)#{Regexp.escape(domain)}$/
|
114
|
-
else
|
115
|
-
domain = a[1]
|
116
|
-
@domain = /^#{Regexp.escape(a[1])}$/
|
117
|
-
end
|
111
|
+
when 'domain'; @domain = /(^|\.)#{Regexp.escape a[1].sub(/^./, '')}$/
|
112
|
+
when 'expires'; @expires = a[1].to_time
|
118
113
|
end
|
119
114
|
}
|
120
|
-
@name, @value = ck[0]
|
121
|
-
|
122
|
-
|
115
|
+
@name, @value = ck[0].split('=', 2)
|
116
|
+
#@value.gsub!(/^['"]|['"]$/, '')
|
117
|
+
L.debug args if !@domain
|
118
|
+
(scout.cookies[scout.uri.host] ||= {})[@name] = self
|
123
119
|
else
|
124
120
|
@name, cookie = args[0]
|
125
121
|
case cookie
|
@@ -134,7 +130,11 @@ module HTTPAccessKit
|
|
134
130
|
end
|
135
131
|
|
136
132
|
def use(str, uri)
|
137
|
-
|
133
|
+
if !@expires or @expires > Time.now
|
134
|
+
str << @string if uri.path[@path] and !uri.root || uri.host[@domain]
|
135
|
+
else
|
136
|
+
:expired
|
137
|
+
end
|
138
138
|
end
|
139
139
|
|
140
140
|
def to_s; @value end
|
@@ -296,7 +296,7 @@ module HTTPAccessKit
|
|
296
296
|
header = DefaultHeader.dup
|
297
297
|
if @cookieProc
|
298
298
|
cookies = ''
|
299
|
-
main_cks.
|
299
|
+
main_cks.each {|k, v| main_cks.delete k if v.use(cookies, @uri) == :expired}
|
300
300
|
header['Cookie'] = cookies[0..-3]
|
301
301
|
end
|
302
302
|
if @refforge
|
metadata
CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 2
|
9
|
-
- 2
|
10
|
-
version: 0.2.2
|
8
|
+
- 3
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Sergey Baev
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-08-
|
18
|
+
date: 2012-08-02 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: rmtools
|
@@ -80,7 +80,7 @@ dependencies:
|
|
80
80
|
version: "2.12"
|
81
81
|
type: :development
|
82
82
|
version_requirements: *id004
|
83
|
-
description:
|
83
|
+
description: Webscraping library based on curb gem extension and libxml-ruby (and optionally Johnson and ActiveRecord)
|
84
84
|
email:
|
85
85
|
- tinbka@gmail.com
|
86
86
|
executables: []
|
@@ -89,7 +89,6 @@ extensions:
|
|
89
89
|
- ext/curb/extconf.rb
|
90
90
|
extra_rdoc_files:
|
91
91
|
- ./Manifest.txt
|
92
|
-
- ./License.txt
|
93
92
|
- ./README.txt
|
94
93
|
- ./History.txt
|
95
94
|
files:
|
@@ -150,7 +149,6 @@ files:
|
|
150
149
|
- ./Rakefile
|
151
150
|
- ./Manifest.txt
|
152
151
|
- ./CURB-LICENSE
|
153
|
-
- ./License.txt
|
154
152
|
- ./README.txt
|
155
153
|
- ./Gemfile
|
156
154
|
- ./History.txt
|
data/License.txt
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
Copyright (c) 2010 Anonymous <tinbka@gmail.com>
|
2
|
-
All rights reserved.
|
3
|
-
|
4
|
-
This work is licensed under the same license as Ruby language.
|
5
|
-
|
6
|
-
|
7
|
-
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
8
|
-
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
9
|
-
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
10
|
-
ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
11
|
-
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
12
|
-
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
13
|
-
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
14
|
-
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
15
|
-
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
16
|
-
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
17
|
-
SUCH DAMAGE.
|