rhack 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +0,0 @@
1
- === Version 1.0.0
2
-
3
- * Divided by classes and packed as gem
4
-
@@ -55,7 +55,6 @@ test/test_frame.rb
55
55
  ./Rakefile
56
56
  ./Manifest.txt
57
57
  ./CURB-LICENSE
58
- ./License.txt
59
58
  ./README.txt
60
59
  ./Gemfile
61
60
  ./History.txt
data/README.txt CHANGED
@@ -1,7 +1,7 @@
1
1
  ## RHACK
2
2
 
3
3
  Rhack is Ruby Http ACcess Kit -- curl-based web-client for developing web-clients
4
- applications such as webscrappers.
4
+ applications.
5
5
 
6
6
  ## License
7
7
 
data/Rakefile CHANGED
@@ -8,13 +8,13 @@ rescue LoadError
8
8
  end
9
9
 
10
10
  compile_manifest
11
- RHACK_VERSION = '0.2.2'
11
+ RHACK_VERSION = '0.3.0'
12
12
  begin
13
13
  require 'hoe'
14
14
  config = Hoe.spec "rhack" do |h|
15
15
  h.developer("Sergey Baev", "tinbka@gmail.com")
16
16
 
17
- h.description = 'Webscrapping library based on curb gem extension and libxml-ruby (and optionally Johnson and ActiveRecord)'
17
+ h.description = 'Webscraping library based on curb gem extension and libxml-ruby (and optionally Johnson and ActiveRecord)'
18
18
  # h.summary = h.description
19
19
  h.url = 'http://github.com/tinbka'
20
20
 
@@ -212,6 +212,12 @@ static void ruby_curl_easy_free(ruby_curl_easy *rbce) {
212
212
  if (rbce->curl) {
213
213
  curl_easy_cleanup(rbce->curl);
214
214
  }
215
+
216
+ if (rbce->first != NULL) {
217
+ curl_formfree(rbce->first);
218
+ rbce->first = NULL;
219
+ rbce->last = NULL;
220
+ }
215
221
  }
216
222
 
217
223
  void curl_easy_free(ruby_curl_easy *rbce) {
@@ -261,6 +267,8 @@ static void ruby_curl_easy_zero(ruby_curl_easy *rbce) {
261
267
  rbce->enable_cookies = 0;
262
268
  rbce->ignore_content_length = 0;
263
269
  rbce->callback_active = 0;
270
+ rbce->first = NULL;
271
+ rbce->last = NULL;
264
272
  }
265
273
 
266
274
  /*
@@ -118,12 +118,14 @@ module Curl
118
118
  recall
119
119
  execute
120
120
  end
121
+ alias :reload :reset
121
122
 
122
123
  def reset!
123
124
  recall!
124
125
  execute
125
126
  end
126
- module_function :reset!, :reset
127
+ alias :reload! :reset!
128
+ module_function :reset!, :reset, :reload!, :reload
127
129
 
128
130
  def status(raise_e=true)
129
131
  if $CarierThread and (s = $CarierThread.status)
@@ -698,7 +698,7 @@ module HTTPAccessKit
698
698
  form = "[action=#{@loc.path.inspect}]" if form == :self
699
699
  if form.is String
700
700
  form_node = at form
701
- raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node
701
+ raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
702
702
  else form_node = form
703
703
  end
704
704
  hash = form_node.inputs_all.merge!(hash)
@@ -1,27 +1,38 @@
1
1
  # encoding: utf-8
2
2
  module HTTPAccessKit
3
3
  include RMTools
4
- CONFIG = YAML.load(read(%W(rhack.yml #{File.join(ENV['HOME'], 'rhack.yml')})) || '') || {}
4
+ extend RMTools
5
+
6
+ CONFIG = if ENV["APP_ROOT"]
7
+ YAML.load(read(File.join ENV["APP_ROOT"], 'config/rhack.yml') || '') || {}
8
+ else
9
+ YAML.load(read(%W(config/rhack.yml rhack.yml #{File.join(ENV['HOME'], 'rhack.yml')})) || '') || {}
10
+ end
5
11
 
6
12
  UAS = if File.file?(uas = CONFIG['ua file'] || File.join(ENV['HOME'], 'ua.txt'))
7
13
  IO.read(uas)/"\n"
8
- else ['Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9.1.8) Gecko/20100202 MRA 5.6 (build 03278) Firefox/3.5.8 (.NET CLR 3.5.30729)'] end
14
+ else ['Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1'] end
9
15
 
10
16
  L = RMLogger.new(CONFIG.logger || {})
11
17
 
12
- config = CONFIG.db || File.join(ENV['HOME'], 'db.yml')
18
+ db_config = if ENV["APP_ROOT"] and ENV["RAILS_ENV"] and File.file?(dbyml = File.join(ENV["APP_ROOT"], 'config/database.yml'))
19
+ YAML.load(read(dbyml))[ENV["RAILS_ENV"]]
20
+ else
21
+ CONFIG.db || File.join(ENV['HOME'], 'db.yml')
22
+ end
13
23
  begin
14
- DB = ActiveRecord::Base.establish_connection_with config
24
+ DB = ActiveRecord::Base.establish_connection_with db_config
15
25
  rescue LoadError
16
- DB = nil
26
+ DB = nil
17
27
  end
18
-
19
- cache = CONFIG.cache || {}
20
- CacheDir = cache.dir || File.join(File.dirname(__FILE__), 'cache')
21
- CacheTable = (cache.table || :rhack_cache).to_sym
22
- CacheTTL = cache.clean ? eval(cache.clean).b : nil
23
-
24
- RETRY = CONFIG['scout retry'] || {}
28
+ if DB and !(CONFIG.cache and CONFIG.cache.enabled == false)
29
+ cache = CONFIG.cache || {}
30
+ CacheDir = cache.dir || File.join(File.dirname(__FILE__), 'cache')
31
+ CacheTable = (cache.table || :rhack_cache).to_sym
32
+ CacheTTL = cache.clean ? eval(cache.clean).b : nil
33
+ end
34
+
35
+ RETRY = CONFIG['scout retry'] || {}
25
36
 
26
37
  $uas ||= UAS
27
38
  $Carier ||= Curl::Multi.new
@@ -32,5 +43,5 @@ module HTTPAccessKit
32
43
  end
33
44
  end
34
45
 
35
- module Curl; include HTTPAccessKit end
46
+ module Curl; extend HTTPAccessKit end
36
47
  RHACK = HTTPAccessKit
@@ -1,6 +1,7 @@
1
1
  # encoding: utf-8
2
2
  $KCODE = 'UTF-8' if RUBY_VERSION < "1.9"
3
3
  require 'rmtools_dev' unless defined? RMTools
4
+ requrie 'cgi'
4
5
  here = File.expand_path File.dirname __FILE__
5
6
  require File.join(here, 'curb_core.so')
6
7
  require 'active_record'
@@ -13,4 +14,4 @@ require "#{here}/curl-global"
13
14
  require "#{here}/scout"
14
15
  require "#{here}/frame"
15
16
  require "#{here}/words"
16
- require "#{here}/cache" if RHACK::DB
17
+ require "#{here}/cache" if defined? RHACK::CacheDir
@@ -1,6 +1,8 @@
1
1
  # encoding: utf-8
2
2
  require 'rhack'
3
3
 
4
+ # Вызовы сервисов всегда ждут и возвращают обработанный ответ, если вызваны без блока.
5
+ # В противном случае используется событийная модель и обработанный ответ передаётся в блок.
4
6
  module HTTPAccessKit
5
7
 
6
8
  class Service
@@ -67,14 +67,14 @@ module Curl
67
67
  end
68
68
 
69
69
  @req = {}
70
- @req.url = easy.last_effective_url
71
- @req.header = easy.headers
70
+ @req.url = easy.last_effective_url
71
+ @req.headers = easy.headers
72
72
  if range = easy.headers.Range and range[/(\d+)-(\d+)/]
73
- @req.range = $1.to_i .. $2.to_i
73
+ @req.range = $1.to_i .. $2.to_i
74
74
  end
75
75
  if easy.base and @req.meth = easy.base.last_method and @req.meth == :post
76
- @req.body = easy.post_body
77
- @req.mp = easy.multipart_form_post?
76
+ @req.body = easy.post_body
77
+ @req.mp = easy.multipart_form_post?
78
78
  end
79
79
  end
80
80
 
@@ -90,6 +90,7 @@ module Curl
90
90
  @error ? @error[key_or_index] : @hash[key_or_index.downcase]
91
91
  end
92
92
 
93
+ alias :headers :hash
93
94
  end
94
95
 
95
96
  end
@@ -101,25 +102,20 @@ module HTTPAccessKit
101
102
 
102
103
  def initialize(*args)
103
104
  if args[1].is Scout
104
- domain, str, scout = nil, *args
105
+ str, scout = *args
105
106
  ck = str//;\s*/
106
107
  ck[1..-1].each {|par|
107
108
  a = par/'='
108
- case a[0]
109
+ case a[0].downcase
109
110
  when 'path'; @path = (a[1] == '/') ? // : /^#{Regexp.escape a[1]}/
110
- when 'domain'
111
- if a[1].ord == ?.
112
- domain = a[1][1..-1]
113
- @domain = /(^|\.)#{Regexp.escape(domain)}$/
114
- else
115
- domain = a[1]
116
- @domain = /^#{Regexp.escape(a[1])}$/
117
- end
111
+ when 'domain'; @domain = /(^|\.)#{Regexp.escape a[1].sub(/^./, '')}$/
112
+ when 'expires'; @expires = a[1].to_time
118
113
  end
119
114
  }
120
- @name, @value = ck[0]/'='
121
- L.debug args if !domain
122
- (scout.cookies[domain || scout.uri.host] ||= {})[@name] = self
115
+ @name, @value = ck[0].split('=', 2)
116
+ #@value.gsub!(/^['"]|['"]$/, '')
117
+ L.debug args if !@domain
118
+ (scout.cookies[scout.uri.host] ||= {})[@name] = self
123
119
  else
124
120
  @name, cookie = args[0]
125
121
  case cookie
@@ -134,7 +130,11 @@ module HTTPAccessKit
134
130
  end
135
131
 
136
132
  def use(str, uri)
137
- str << @string if uri.path[@path] and !uri.root || uri.host[@domain]
133
+ if !@expires or @expires > Time.now
134
+ str << @string if uri.path[@path] and !uri.root || uri.host[@domain]
135
+ else
136
+ :expired
137
+ end
138
138
  end
139
139
 
140
140
  def to_s; @value end
@@ -296,7 +296,7 @@ module HTTPAccessKit
296
296
  header = DefaultHeader.dup
297
297
  if @cookieProc
298
298
  cookies = ''
299
- main_cks.each_value {|v| v.use cookies, @uri}
299
+ main_cks.each {|k, v| main_cks.delete k if v.use(cookies, @uri) == :expired}
300
300
  header['Cookie'] = cookies[0..-3]
301
301
  end
302
302
  if @refforge
metadata CHANGED
@@ -5,9 +5,9 @@ version: !ruby/object:Gem::Version
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 2
9
- - 2
10
- version: 0.2.2
8
+ - 3
9
+ - 0
10
+ version: 0.3.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Sergey Baev
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-08-01 00:00:00 Z
18
+ date: 2012-08-02 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: rmtools
@@ -80,7 +80,7 @@ dependencies:
80
80
  version: "2.12"
81
81
  type: :development
82
82
  version_requirements: *id004
83
- description: Webscrapping library based on curb gem extension and libxml-ruby (and optionally Johnson and ActiveRecord)
83
+ description: Webscraping library based on curb gem extension and libxml-ruby (and optionally Johnson and ActiveRecord)
84
84
  email:
85
85
  - tinbka@gmail.com
86
86
  executables: []
@@ -89,7 +89,6 @@ extensions:
89
89
  - ext/curb/extconf.rb
90
90
  extra_rdoc_files:
91
91
  - ./Manifest.txt
92
- - ./License.txt
93
92
  - ./README.txt
94
93
  - ./History.txt
95
94
  files:
@@ -150,7 +149,6 @@ files:
150
149
  - ./Rakefile
151
150
  - ./Manifest.txt
152
151
  - ./CURB-LICENSE
153
- - ./License.txt
154
152
  - ./README.txt
155
153
  - ./Gemfile
156
154
  - ./History.txt
@@ -1,17 +0,0 @@
1
- Copyright (c) 2010 Anonymous <tinbka@gmail.com>
2
- All rights reserved.
3
-
4
- This work is licensed under the same license as Ruby language.
5
-
6
-
7
- THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
8
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
9
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
10
- ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
11
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
12
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
13
- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
14
- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
15
- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
16
- OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
17
- SUCH DAMAGE.