rhack 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +0 -4
- data/Manifest.txt +0 -1
- data/README.txt +1 -1
- data/Rakefile +2 -2
- data/ext/curb/curb_easy.c +8 -0
- data/lib/curl-global.rb +3 -1
- data/lib/frame.rb +1 -1
- data/lib/init.rb +24 -13
- data/lib/rhack.rb +2 -1
- data/lib/rhack/services.rb +2 -0
- data/lib/scout.rb +20 -20
- metadata +5 -7
- data/License.txt +0 -17
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
data/README.txt
CHANGED
data/Rakefile
CHANGED
@@ -8,13 +8,13 @@ rescue LoadError
|
|
8
8
|
end
|
9
9
|
|
10
10
|
compile_manifest
|
11
|
-
RHACK_VERSION = '0.2.2'
|
11
|
+
RHACK_VERSION = '0.3.0'
|
12
12
|
begin
|
13
13
|
require 'hoe'
|
14
14
|
config = Hoe.spec "rhack" do |h|
|
15
15
|
h.developer("Sergey Baev", "tinbka@gmail.com")
|
16
16
|
|
17
|
-
h.description = '
|
17
|
+
h.description = 'Webscraping library based on curb gem extension and libxml-ruby (and optionally Johnson and ActiveRecord)'
|
18
18
|
# h.summary = h.description
|
19
19
|
h.url = 'http://github.com/tinbka'
|
20
20
|
|
data/ext/curb/curb_easy.c
CHANGED
@@ -212,6 +212,12 @@ static void ruby_curl_easy_free(ruby_curl_easy *rbce) {
|
|
212
212
|
if (rbce->curl) {
|
213
213
|
curl_easy_cleanup(rbce->curl);
|
214
214
|
}
|
215
|
+
|
216
|
+
if (rbce->first != NULL) {
|
217
|
+
curl_formfree(rbce->first);
|
218
|
+
rbce->first = NULL;
|
219
|
+
rbce->last = NULL;
|
220
|
+
}
|
215
221
|
}
|
216
222
|
|
217
223
|
void curl_easy_free(ruby_curl_easy *rbce) {
|
@@ -261,6 +267,8 @@ static void ruby_curl_easy_zero(ruby_curl_easy *rbce) {
|
|
261
267
|
rbce->enable_cookies = 0;
|
262
268
|
rbce->ignore_content_length = 0;
|
263
269
|
rbce->callback_active = 0;
|
270
|
+
rbce->first = NULL;
|
271
|
+
rbce->last = NULL;
|
264
272
|
}
|
265
273
|
|
266
274
|
/*
|
data/lib/curl-global.rb
CHANGED
@@ -118,12 +118,14 @@ module Curl
|
|
118
118
|
recall
|
119
119
|
execute
|
120
120
|
end
|
121
|
+
alias :reload :reset
|
121
122
|
|
122
123
|
def reset!
|
123
124
|
recall!
|
124
125
|
execute
|
125
126
|
end
|
126
|
-
|
127
|
+
alias :reload! :reset!
|
128
|
+
module_function :reset!, :reset, :reload!, :reload
|
127
129
|
|
128
130
|
def status(raise_e=true)
|
129
131
|
if $CarierThread and (s = $CarierThread.status)
|
data/lib/frame.rb
CHANGED
@@ -698,7 +698,7 @@ module HTTPAccessKit
|
|
698
698
|
form = "[action=#{@loc.path.inspect}]" if form == :self
|
699
699
|
if form.is String
|
700
700
|
form_node = at form
|
701
|
-
raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node
|
701
|
+
raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
|
702
702
|
else form_node = form
|
703
703
|
end
|
704
704
|
hash = form_node.inputs_all.merge!(hash)
|
data/lib/init.rb
CHANGED
@@ -1,27 +1,38 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
module HTTPAccessKit
|
3
3
|
include RMTools
|
4
|
-
|
4
|
+
extend RMTools
|
5
|
+
|
6
|
+
CONFIG = if ENV["APP_ROOT"]
|
7
|
+
YAML.load(read(File.join ENV["APP_ROOT"], 'config/rhack.yml') || '') || {}
|
8
|
+
else
|
9
|
+
YAML.load(read(%W(config/rhack.yml rhack.yml #{File.join(ENV['HOME'], 'rhack.yml')})) || '') || {}
|
10
|
+
end
|
5
11
|
|
6
12
|
UAS = if File.file?(uas = CONFIG['ua file'] || File.join(ENV['HOME'], 'ua.txt'))
|
7
13
|
IO.read(uas)/"\n"
|
8
|
-
else ['Mozilla/5.0 (Windows
|
14
|
+
else ['Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1'] end
|
9
15
|
|
10
16
|
L = RMLogger.new(CONFIG.logger || {})
|
11
17
|
|
12
|
-
|
18
|
+
db_config = if ENV["APP_ROOT"] and ENV["RAILS_ENV"] and File.file?(dbyml = File.join(ENV["APP_ROOT"], 'config/database.yml'))
|
19
|
+
YAML.load(read(dbyml))[ENV["RAILS_ENV"]]
|
20
|
+
else
|
21
|
+
CONFIG.db || File.join(ENV['HOME'], 'db.yml')
|
22
|
+
end
|
13
23
|
begin
|
14
|
-
|
24
|
+
DB = ActiveRecord::Base.establish_connection_with db_config
|
15
25
|
rescue LoadError
|
16
|
-
|
26
|
+
DB = nil
|
17
27
|
end
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
28
|
+
if DB and !(CONFIG.cache and CONFIG.cache.enabled == false)
|
29
|
+
cache = CONFIG.cache || {}
|
30
|
+
CacheDir = cache.dir || File.join(File.dirname(__FILE__), 'cache')
|
31
|
+
CacheTable = (cache.table || :rhack_cache).to_sym
|
32
|
+
CacheTTL = cache.clean ? eval(cache.clean).b : nil
|
33
|
+
end
|
34
|
+
|
35
|
+
RETRY = CONFIG['scout retry'] || {}
|
25
36
|
|
26
37
|
$uas ||= UAS
|
27
38
|
$Carier ||= Curl::Multi.new
|
@@ -32,5 +43,5 @@ module HTTPAccessKit
|
|
32
43
|
end
|
33
44
|
end
|
34
45
|
|
35
|
-
module Curl;
|
46
|
+
module Curl; extend HTTPAccessKit end
|
36
47
|
RHACK = HTTPAccessKit
|
data/lib/rhack.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
$KCODE = 'UTF-8' if RUBY_VERSION < "1.9"
|
3
3
|
require 'rmtools_dev' unless defined? RMTools
|
4
|
+
require 'cgi'
|
4
5
|
here = File.expand_path File.dirname __FILE__
|
5
6
|
require File.join(here, 'curb_core.so')
|
6
7
|
require 'active_record'
|
@@ -13,4 +14,4 @@ require "#{here}/curl-global"
|
|
13
14
|
require "#{here}/scout"
|
14
15
|
require "#{here}/frame"
|
15
16
|
require "#{here}/words"
|
16
|
-
require "#{here}/cache" if RHACK::
|
17
|
+
require "#{here}/cache" if defined? RHACK::CacheDir
|
data/lib/rhack/services.rb
CHANGED
data/lib/scout.rb
CHANGED
@@ -67,14 +67,14 @@ module Curl
|
|
67
67
|
end
|
68
68
|
|
69
69
|
@req = {}
|
70
|
-
@req.url
|
71
|
-
@req.
|
70
|
+
@req.url = easy.last_effective_url
|
71
|
+
@req.headers = easy.headers
|
72
72
|
if range = easy.headers.Range and range[/(\d+)-(\d+)/]
|
73
|
-
@req.range
|
73
|
+
@req.range = $1.to_i .. $2.to_i
|
74
74
|
end
|
75
75
|
if easy.base and @req.meth = easy.base.last_method and @req.meth == :post
|
76
|
-
@req.body
|
77
|
-
@req.mp
|
76
|
+
@req.body = easy.post_body
|
77
|
+
@req.mp = easy.multipart_form_post?
|
78
78
|
end
|
79
79
|
end
|
80
80
|
|
@@ -90,6 +90,7 @@ module Curl
|
|
90
90
|
@error ? @error[key_or_index] : @hash[key_or_index.downcase]
|
91
91
|
end
|
92
92
|
|
93
|
+
alias :headers :hash
|
93
94
|
end
|
94
95
|
|
95
96
|
end
|
@@ -101,25 +102,20 @@ module HTTPAccessKit
|
|
101
102
|
|
102
103
|
def initialize(*args)
|
103
104
|
if args[1].is Scout
|
104
|
-
|
105
|
+
str, scout = *args
|
105
106
|
ck = str//;\s*/
|
106
107
|
ck[1..-1].each {|par|
|
107
108
|
a = par/'='
|
108
|
-
case a[0]
|
109
|
+
case a[0].downcase
|
109
110
|
when 'path'; @path = (a[1] == '/') ? // : /^#{Regexp.escape a[1]}/
|
110
|
-
when 'domain'
|
111
|
-
|
112
|
-
domain = a[1][1..-1]
|
113
|
-
@domain = /(^|\.)#{Regexp.escape(domain)}$/
|
114
|
-
else
|
115
|
-
domain = a[1]
|
116
|
-
@domain = /^#{Regexp.escape(a[1])}$/
|
117
|
-
end
|
111
|
+
when 'domain'; @domain = /(^|\.)#{Regexp.escape a[1].sub(/^./, '')}$/
|
112
|
+
when 'expires'; @expires = a[1].to_time
|
118
113
|
end
|
119
114
|
}
|
120
|
-
@name, @value = ck[0]
|
121
|
-
|
122
|
-
|
115
|
+
@name, @value = ck[0].split('=', 2)
|
116
|
+
#@value.gsub!(/^['"]|['"]$/, '')
|
117
|
+
L.debug args if !@domain
|
118
|
+
(scout.cookies[scout.uri.host] ||= {})[@name] = self
|
123
119
|
else
|
124
120
|
@name, cookie = args[0]
|
125
121
|
case cookie
|
@@ -134,7 +130,11 @@ module HTTPAccessKit
|
|
134
130
|
end
|
135
131
|
|
136
132
|
def use(str, uri)
|
137
|
-
|
133
|
+
if !@expires or @expires > Time.now
|
134
|
+
str << @string if uri.path[@path] and !uri.root || uri.host[@domain]
|
135
|
+
else
|
136
|
+
:expired
|
137
|
+
end
|
138
138
|
end
|
139
139
|
|
140
140
|
def to_s; @value end
|
@@ -296,7 +296,7 @@ module HTTPAccessKit
|
|
296
296
|
header = DefaultHeader.dup
|
297
297
|
if @cookieProc
|
298
298
|
cookies = ''
|
299
|
-
main_cks.
|
299
|
+
main_cks.each {|k, v| main_cks.delete k if v.use(cookies, @uri) == :expired}
|
300
300
|
header['Cookie'] = cookies[0..-3]
|
301
301
|
end
|
302
302
|
if @refforge
|
metadata
CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
-
|
10
|
-
version: 0.2.2
|
8
|
+
- 3
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Sergey Baev
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-08-
|
18
|
+
date: 2012-08-02 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: rmtools
|
@@ -80,7 +80,7 @@ dependencies:
|
|
80
80
|
version: "2.12"
|
81
81
|
type: :development
|
82
82
|
version_requirements: *id004
|
83
|
-
description:
|
83
|
+
description: Webscraping library based on curb gem extension and libxml-ruby (and optionally Johnson and ActiveRecord)
|
84
84
|
email:
|
85
85
|
- tinbka@gmail.com
|
86
86
|
executables: []
|
@@ -89,7 +89,6 @@ extensions:
|
|
89
89
|
- ext/curb/extconf.rb
|
90
90
|
extra_rdoc_files:
|
91
91
|
- ./Manifest.txt
|
92
|
-
- ./License.txt
|
93
92
|
- ./README.txt
|
94
93
|
- ./History.txt
|
95
94
|
files:
|
@@ -150,7 +149,6 @@ files:
|
|
150
149
|
- ./Rakefile
|
151
150
|
- ./Manifest.txt
|
152
151
|
- ./CURB-LICENSE
|
153
|
-
- ./License.txt
|
154
152
|
- ./README.txt
|
155
153
|
- ./Gemfile
|
156
154
|
- ./History.txt
|
data/License.txt
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
Copyright (c) 2010 Anonymous <tinbka@gmail.com>
|
2
|
-
All rights reserved.
|
3
|
-
|
4
|
-
This work is licensed under the same license as Ruby language.
|
5
|
-
|
6
|
-
|
7
|
-
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
8
|
-
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
9
|
-
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
10
|
-
ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
11
|
-
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
12
|
-
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
13
|
-
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
14
|
-
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
15
|
-
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
16
|
-
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
17
|
-
SUCH DAMAGE.
|