web_crawler 0.2.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +3 -0
- data/lib/web_crawler/batch_request.rb +1 -0
- data/lib/web_crawler/cli.rb +4 -0
- data/lib/web_crawler/response.rb +37 -1
- data/lib/web_crawler/version.rb +2 -2
- data/lib/web_crawler/view/runner.rb +57 -4
- data/lib/web_crawler/view/yaml.rb +9 -0
- data/lib/web_crawler/view.rb +21 -3
- data/web_crawler.gemspec +2 -1
- metadata +26 -9
data/Gemfile
CHANGED
data/lib/web_crawler/cli.rb
CHANGED
@@ -14,13 +14,16 @@ module WebCrawler
|
|
14
14
|
|
15
15
|
class_option :format, type: :string, desc: "output format [json, xml, csv]", default: 'plain'
|
16
16
|
class_option :json, type: :boolean, desc: "json output format. shortcut for --format json"
|
17
|
+
class_option :yaml, type: :boolean, desc: "yaml output format. shortcut for --format yaml"
|
17
18
|
class_option :xml, type: :boolean, desc: "xml output format. shortcut for --format xml"
|
18
19
|
class_option :csv, type: :boolean, desc: "csv output format. shortcut for --format csv"
|
19
20
|
class_option :table, type: :boolean, desc: "table output format. shortcut for --format table"
|
20
21
|
class_option :cached, type: :boolean, desc: "use cached requests. if ./tmp/cache exists use it for cache files"
|
21
22
|
class_option :follow, type: :boolean, desc: "follow to urls on the pages"
|
22
23
|
class_option :run, type: :string, desc: "run custom script with api access"
|
24
|
+
class_option :console, type: :boolean, desc: "run irb console after execution"
|
23
25
|
class_option :log, type: :string, desc: "log file path"
|
26
|
+
class_option :output, type: :string, desc: "output file path"
|
24
27
|
|
25
28
|
before_action except: :help do
|
26
29
|
@options = options.dup
|
@@ -29,6 +32,7 @@ module WebCrawler
|
|
29
32
|
@options[:format] = 'csv' if options[:csv]
|
30
33
|
@options[:format] = 'table' if options[:table]
|
31
34
|
@options[:format] = 'plain' if options[:plain]
|
35
|
+
@options[:format] = 'yaml' if options[:yaml]
|
32
36
|
|
33
37
|
@options[:original_format] = @options[:format] if options[:run]
|
34
38
|
@options[:format] = 'runner' if options[:run]
|
data/lib/web_crawler/response.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
|
+
require 'mime/types'
|
2
|
+
|
1
3
|
module WebCrawler
|
2
4
|
class Response
|
3
5
|
extend ::Forwardable
|
4
6
|
|
5
|
-
delegate [:
|
7
|
+
delegate [:http_version, :code, :message, :msg, :code_type, :[], :redirect_path, :redirect?] => '@response'
|
6
8
|
|
7
9
|
attr_reader :url, :expire, :date, :cached
|
8
10
|
|
@@ -13,6 +15,14 @@ module WebCrawler
|
|
13
15
|
@expire ||= Time.parse(self['Expires']) rescue Time.now
|
14
16
|
end
|
15
17
|
|
18
|
+
[:xml, :html, :json].each do |type|
|
19
|
+
class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
20
|
+
def #{type}?
|
21
|
+
mime_type.sub_type == '#{type}'
|
22
|
+
end
|
23
|
+
RUBY
|
24
|
+
end
|
25
|
+
|
16
26
|
def set_cached_flag
|
17
27
|
@cached = ' CACHED'
|
18
28
|
end
|
@@ -36,8 +46,34 @@ module WebCrawler
|
|
36
46
|
"#{redirected}>"
|
37
47
|
end
|
38
48
|
|
49
|
+
def mime_type
|
50
|
+
MIME::Types[header['content-type']].first
|
51
|
+
end
|
52
|
+
|
53
|
+
def header
|
54
|
+
@header ||= Hash[@response.to_hash.map(&:flatten)]
|
55
|
+
end
|
56
|
+
|
57
|
+
def body
|
58
|
+
type, encoding = self['Content-Type'].split("=")
|
59
|
+
@body ||= if encoding.upcase == 'UTF-8'
|
60
|
+
@response.body
|
61
|
+
else
|
62
|
+
encode_body(encoding.upcase)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
39
66
|
alias :to_s :body
|
40
67
|
|
68
|
+
def encode_body(from)
|
69
|
+
require "iconv" unless defined?(Iconv)
|
70
|
+
encoded = Iconv.iconv('UTF-8', from, @response.body).first
|
71
|
+
if xml?
|
72
|
+
encoded = encoded.gsub(/<\?xml version="(.*?)" encoding=".*?"\?>/, "<?xml version=\"1.0\" encoding=\"utf-8\"?>")
|
73
|
+
end
|
74
|
+
encoded
|
75
|
+
end
|
76
|
+
|
41
77
|
def type
|
42
78
|
@response.class
|
43
79
|
end
|
data/lib/web_crawler/version.rb
CHANGED
data/lib/web_crawler/view/runner.rb
CHANGED
@@ -3,9 +3,22 @@ require "fileutils"
|
|
3
3
|
module WebCrawler::View
|
4
4
|
class Runner < Base
|
5
5
|
|
6
|
-
|
7
|
-
|
6
|
+
class WorkSpace
|
7
|
+
# array of responses
|
8
8
|
attr_accessor :responses
|
9
|
+
attr_accessor :results
|
10
|
+
|
11
|
+
def q
|
12
|
+
exit
|
13
|
+
end
|
14
|
+
|
15
|
+
def returning(value)
|
16
|
+
self.results = value
|
17
|
+
end
|
18
|
+
|
19
|
+
def method_missing(meth, *args, &block)
|
20
|
+
puts "\e[31m\e[1mError: method \"\e[0m\e[31m#{meth}\e[0m\e[31m\e[1m\" is missing\e[0m"
|
21
|
+
end
|
9
22
|
end
|
10
23
|
|
11
24
|
def render
|
@@ -13,8 +26,48 @@ module WebCrawler::View
|
|
13
26
|
@options['run'] = File.expand_path @options['run'], FileUtils.pwd
|
14
27
|
end
|
15
28
|
|
16
|
-
|
17
|
-
|
29
|
+
@work_space = WorkSpace.new
|
30
|
+
@work_space.responses = input.freeze
|
31
|
+
@work_space.results = eval(File.open(@options['run'], 'r').read, @work_space.instance_eval("binding"), @options['run'])
|
32
|
+
|
33
|
+
load_console! if @options['console']
|
34
|
+
|
35
|
+
WebCrawler::View.factory(@options['original_format'], @work_space.results, @options).render
|
36
|
+
end
|
37
|
+
|
38
|
+
def load_console!
|
39
|
+
require "irb"
|
40
|
+
IRB.init_config nil
|
41
|
+
IRB.instance_exec do
|
42
|
+
@CONF[:BACK_TRACE_LIMIT] = 1
|
43
|
+
|
44
|
+
@CONF[:PROMPT][:SIMPLE] = { :PROMPT_I => "[\e[1m\e[31mWebCrawler::API\e[0m](%n)>> ",
|
45
|
+
:PROMPT_N => "[\e[1m\e[31mWebCrawler::API\e[0m](%n)>> ",
|
46
|
+
:PROMPT_S => "[\e[1m\e[31mWebCrawler::API\e[0m](%n)*",
|
47
|
+
:PROMPT_C => "(%n)?> ",
|
48
|
+
:RETURN => "\e[90m#=> %s\n\e[0m" }
|
49
|
+
|
50
|
+
@CONF[:PROMPT_MODE] = :SIMPLE
|
51
|
+
end
|
52
|
+
|
53
|
+
irb = IRB::Irb.new IRB::WorkSpace.new(@work_space)
|
54
|
+
|
55
|
+
|
56
|
+
IRB.instance_exec { @CONF[:IRB_RC].call(irb.context) if @CONF[:IRB_RC] }
|
57
|
+
IRB.instance_exec { @CONF[:MAIN_CONTEXT] = irb.context }
|
58
|
+
|
59
|
+
|
60
|
+
trap("SIGINT") do
|
61
|
+
irb.signal_handle
|
62
|
+
end
|
63
|
+
|
64
|
+
begin
|
65
|
+
catch(:IRB_EXIT) do
|
66
|
+
irb.eval_input
|
67
|
+
end
|
68
|
+
ensure
|
69
|
+
IRB.irb_at_exit
|
70
|
+
end
|
18
71
|
end
|
19
72
|
end
|
20
73
|
end
|
data/lib/web_crawler/view.rb
CHANGED
@@ -6,6 +6,7 @@ module WebCrawler::View
|
|
6
6
|
autoload :Plain, 'web_crawler/view/plain'
|
7
7
|
autoload :Table, 'web_crawler/view/table'
|
8
8
|
autoload :Runner, 'web_crawler/view/runner'
|
9
|
+
autoload :Yaml, 'web_crawler/view/yaml'
|
9
10
|
|
10
11
|
extend self
|
11
12
|
|
@@ -18,8 +19,9 @@ module WebCrawler::View
|
|
18
19
|
|
19
20
|
class << self
|
20
21
|
attr_accessor :default_options
|
22
|
+
|
21
23
|
def default_options
|
22
|
-
@default_options ||= { }
|
24
|
+
@default_options ||= { 'output' => $stdout }
|
23
25
|
end
|
24
26
|
end
|
25
27
|
|
@@ -32,13 +34,29 @@ module WebCrawler::View
|
|
32
34
|
[*input].map { |i| format(i) }.join
|
33
35
|
end
|
34
36
|
|
35
|
-
def draw(output
|
36
|
-
|
37
|
+
def draw(output=nil)
|
38
|
+
begin
|
39
|
+
present_output(output).puts render
|
40
|
+
ensure
|
41
|
+
output.close if output.respond_to? :close
|
42
|
+
end
|
37
43
|
end
|
38
44
|
|
39
45
|
def format(item)
|
40
46
|
item
|
41
47
|
end
|
48
|
+
|
49
|
+
protected
|
50
|
+
|
51
|
+
def present_output(override=nil)
|
52
|
+
@present_output = if override && override.respond_to?(:puts)
|
53
|
+
override
|
54
|
+
elsif @options['output'].is_a?(String)
|
55
|
+
File.open(@options['output'], 'w+')
|
56
|
+
elsif @options['output'].respond_to? :puts
|
57
|
+
@options['output']
|
58
|
+
end
|
59
|
+
end
|
42
60
|
end
|
43
61
|
|
44
62
|
end
|
data/web_crawler.gemspec
CHANGED
@@ -23,7 +23,8 @@ Gem::Specification.new do |s|
|
|
23
23
|
|
24
24
|
s.bindir = "bin"
|
25
25
|
|
26
|
-
s.add_dependency 'thor'
|
26
|
+
s.add_dependency 'thor', '>=0.14.6'
|
27
|
+
s.add_dependency 'mime-types', '>=1.16'
|
27
28
|
|
28
29
|
s.add_development_dependency(%q<rspec>, [">=2.6"])
|
29
30
|
s.add_development_dependency(%q<fakeweb>)
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
- 2
|
8
|
-
- 0
|
9
|
-
version: 0.2.0
|
7
|
+
- 3
|
8
|
+
- 1
|
9
|
+
version: 0.3.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Anton Sozontov
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-06-03 00:00:00 +04:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -27,13 +27,29 @@ dependencies:
|
|
27
27
|
- !ruby/object:Gem::Version
|
28
28
|
segments:
|
29
29
|
- 0
|
30
|
-
|
30
|
+
- 14
|
31
|
+
- 6
|
32
|
+
version: 0.14.6
|
31
33
|
type: :runtime
|
32
34
|
version_requirements: *id001
|
33
35
|
- !ruby/object:Gem::Dependency
|
34
|
-
name: rspec
|
36
|
+
name: mime-types
|
35
37
|
prerelease: false
|
36
38
|
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
segments:
|
44
|
+
- 1
|
45
|
+
- 16
|
46
|
+
version: "1.16"
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id002
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: rspec
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
37
53
|
none: false
|
38
54
|
requirements:
|
39
55
|
- - ">="
|
@@ -43,11 +59,11 @@ dependencies:
|
|
43
59
|
- 6
|
44
60
|
version: "2.6"
|
45
61
|
type: :development
|
46
|
-
version_requirements: *id002
|
62
|
+
version_requirements: *id003
|
47
63
|
- !ruby/object:Gem::Dependency
|
48
64
|
name: fakeweb
|
49
65
|
prerelease: false
|
50
|
-
requirement: &id003 !ruby/object:Gem::Requirement
|
66
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
51
67
|
none: false
|
52
68
|
requirements:
|
53
69
|
- - ">="
|
@@ -56,7 +72,7 @@ dependencies:
|
|
56
72
|
- 0
|
57
73
|
version: "0"
|
58
74
|
type: :development
|
59
|
-
version_requirements: *id003
|
75
|
+
version_requirements: *id004
|
60
76
|
description: Web crawler help you with parse and collect data from the web
|
61
77
|
email:
|
62
78
|
- a.sozontov@gmail.com
|
@@ -104,6 +120,7 @@ files:
|
|
104
120
|
- lib/web_crawler/view/runner.rb
|
105
121
|
- lib/web_crawler/view/table.rb
|
106
122
|
- lib/web_crawler/view/xml.rb
|
123
|
+
- lib/web_crawler/view/yaml.rb
|
107
124
|
- spec/fake_web_generator.rb
|
108
125
|
- spec/spec_helper.rb
|
109
126
|
- spec/web_crawler/batch_request_spec.rb
|