web_crawler 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +3 -0
- data/lib/web_crawler/batch_request.rb +1 -0
- data/lib/web_crawler/cli.rb +4 -0
- data/lib/web_crawler/response.rb +37 -1
- data/lib/web_crawler/version.rb +2 -2
- data/lib/web_crawler/view/runner.rb +57 -4
- data/lib/web_crawler/view/yaml.rb +9 -0
- data/lib/web_crawler/view.rb +21 -3
- data/web_crawler.gemspec +2 -1
- metadata +26 -9
data/Gemfile
CHANGED
data/lib/web_crawler/cli.rb
CHANGED
@@ -14,13 +14,16 @@ module WebCrawler
|
|
14
14
|
|
15
15
|
class_option :format, type: :string, desc: "output format [json, xml, csv]", default: 'plain'
|
16
16
|
class_option :json, type: :boolean, desc: "json output format. shortcut for --format json"
|
17
|
+
class_option :yaml, type: :boolean, desc: "yaml output format. shortcut for --format yaml"
|
17
18
|
class_option :xml, type: :boolean, desc: "xml output format. shortcut for --format xml"
|
18
19
|
class_option :csv, type: :boolean, desc: "csv output format. shortcut for --format csv"
|
19
20
|
class_option :table, type: :boolean, desc: "table output format. shortcut for --format table"
|
20
21
|
class_option :cached, type: :boolean, desc: "use cached requests. if ./tmp/cache exists use it for cache files"
|
21
22
|
class_option :follow, type: :boolean, desc: "follow to urls on the pages"
|
22
23
|
class_option :run, type: :string, desc: "run custom script with api access"
|
24
|
+
class_option :console, type: :boolean, desc: "run irb console after execution"
|
23
25
|
class_option :log, type: :string, desc: "log file path"
|
26
|
+
class_option :output, type: :string, desc: "output file path"
|
24
27
|
|
25
28
|
before_action except: :help do
|
26
29
|
@options = options.dup
|
@@ -29,6 +32,7 @@ module WebCrawler
|
|
29
32
|
@options[:format] = 'csv' if options[:csv]
|
30
33
|
@options[:format] = 'table' if options[:table]
|
31
34
|
@options[:format] = 'plain' if options[:plain]
|
35
|
+
@options[:format] = 'yaml' if options[:yaml]
|
32
36
|
|
33
37
|
@options[:original_format] = @options[:format] if options[:run]
|
34
38
|
@options[:format] = 'runner' if options[:run]
|
data/lib/web_crawler/response.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
|
+
require 'mime/types'
|
2
|
+
|
1
3
|
module WebCrawler
|
2
4
|
class Response
|
3
5
|
extend ::Forwardable
|
4
6
|
|
5
|
-
delegate [:
|
7
|
+
delegate [:http_version, :code, :message, :msg, :code_type, :[], :redirect_path, :redirect?] => '@response'
|
6
8
|
|
7
9
|
attr_reader :url, :expire, :date, :cached
|
8
10
|
|
@@ -13,6 +15,14 @@ module WebCrawler
|
|
13
15
|
@expire ||= Time.parse(self['Expires']) rescue Time.now
|
14
16
|
end
|
15
17
|
|
18
|
+
[:xml, :html, :json].each do |type|
|
19
|
+
class_eval <<-RUBY, __FILE__, __LINE__ + 1
|
20
|
+
def #{type}?
|
21
|
+
mime_type.sub_type == '#{type}'
|
22
|
+
end
|
23
|
+
RUBY
|
24
|
+
end
|
25
|
+
|
16
26
|
def set_cached_flag
|
17
27
|
@cached = ' CACHED'
|
18
28
|
end
|
@@ -36,8 +46,34 @@ module WebCrawler
|
|
36
46
|
"#{redirected}>"
|
37
47
|
end
|
38
48
|
|
49
|
+
def mime_type
|
50
|
+
MIME::Types[header['content-type']].first
|
51
|
+
end
|
52
|
+
|
53
|
+
def header
|
54
|
+
@header ||= Hash[@response.to_hash.map(&:flatten)]
|
55
|
+
end
|
56
|
+
|
57
|
+
def body
|
58
|
+
type, encoding = self['Content-Type'].split("=")
|
59
|
+
@body ||= if encoding.upcase == 'UTF-8'
|
60
|
+
@response.body
|
61
|
+
else
|
62
|
+
encode_body(encoding.upcase)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
39
66
|
alias :to_s :body
|
40
67
|
|
68
|
+
def encode_body(from)
|
69
|
+
require "iconv" unless defined?(Iconv)
|
70
|
+
encoded = Iconv.iconv('UTF-8', from, @response.body).first
|
71
|
+
if xml?
|
72
|
+
encoded = encoded.gsub(/<\?xml version="(.*?)" encoding=".*?"\?>/, "<?xml version=\"1.0\" encoding=\"utf-8\"?>")
|
73
|
+
end
|
74
|
+
encoded
|
75
|
+
end
|
76
|
+
|
41
77
|
def type
|
42
78
|
@response.class
|
43
79
|
end
|
data/lib/web_crawler/version.rb
CHANGED
data/lib/web_crawler/view/runner.rb
CHANGED
@@ -3,9 +3,22 @@ require "fileutils"
|
|
3
3
|
module WebCrawler::View
|
4
4
|
class Runner < Base
|
5
5
|
|
6
|
-
|
7
|
-
|
6
|
+
class WorkSpace
|
7
|
+
# array of responses
|
8
8
|
attr_accessor :responses
|
9
|
+
attr_accessor :results
|
10
|
+
|
11
|
+
def q
|
12
|
+
exit
|
13
|
+
end
|
14
|
+
|
15
|
+
def returning(value)
|
16
|
+
self.results = value
|
17
|
+
end
|
18
|
+
|
19
|
+
def method_missing(meth, *args, &block)
|
20
|
+
puts "\e[31m\e[1mError: method \"\e[0m\e[31m#{meth}\e[0m\e[31m\e[1m\" is missing\e[0m"
|
21
|
+
end
|
9
22
|
end
|
10
23
|
|
11
24
|
def render
|
@@ -13,8 +26,48 @@ module WebCrawler::View
|
|
13
26
|
@options['run'] = File.expand_path @options['run'], FileUtils.pwd
|
14
27
|
end
|
15
28
|
|
16
|
-
|
17
|
-
|
29
|
+
@work_space = WorkSpace.new
|
30
|
+
@work_space.responses = input.freeze
|
31
|
+
@work_space.results = eval(File.open(@options['run'], 'r').read, @work_space.instance_eval("binding"), @options['run'])
|
32
|
+
|
33
|
+
load_console! if @options['console']
|
34
|
+
|
35
|
+
WebCrawler::View.factory(@options['original_format'], @work_space.results, @options).render
|
36
|
+
end
|
37
|
+
|
38
|
+
def load_console!
|
39
|
+
require "irb"
|
40
|
+
IRB.init_config nil
|
41
|
+
IRB.instance_exec do
|
42
|
+
@CONF[:BACK_TRACE_LIMIT] = 1
|
43
|
+
|
44
|
+
@CONF[:PROMPT][:SIMPLE] = { :PROMPT_I => "[\e[1m\e[31mWebCrawler::API\e[0m](%n)>> ",
|
45
|
+
:PROMPT_N => "[\e[1m\e[31mWebCrawler::API\e[0m](%n)>> ",
|
46
|
+
:PROMPT_S => "[\e[1m\e[31mWebCrawler::API\e[0m](%n)*",
|
47
|
+
:PROMPT_C => "(%n)?> ",
|
48
|
+
:RETURN => "\e[90m#=> %s\n\e[0m" }
|
49
|
+
|
50
|
+
@CONF[:PROMPT_MODE] = :SIMPLE
|
51
|
+
end
|
52
|
+
|
53
|
+
irb = IRB::Irb.new IRB::WorkSpace.new(@work_space)
|
54
|
+
|
55
|
+
|
56
|
+
IRB.instance_exec { @CONF[:IRB_RC].call(irb.context) if @CONF[:IRB_RC] }
|
57
|
+
IRB.instance_exec { @CONF[:MAIN_CONTEXT] = irb.context }
|
58
|
+
|
59
|
+
|
60
|
+
trap("SIGINT") do
|
61
|
+
irb.signal_handle
|
62
|
+
end
|
63
|
+
|
64
|
+
begin
|
65
|
+
catch(:IRB_EXIT) do
|
66
|
+
irb.eval_input
|
67
|
+
end
|
68
|
+
ensure
|
69
|
+
IRB.irb_at_exit
|
70
|
+
end
|
18
71
|
end
|
19
72
|
end
|
20
73
|
end
|
data/lib/web_crawler/view.rb
CHANGED
@@ -6,6 +6,7 @@ module WebCrawler::View
|
|
6
6
|
autoload :Plain, 'web_crawler/view/plain'
|
7
7
|
autoload :Table, 'web_crawler/view/table'
|
8
8
|
autoload :Runner, 'web_crawler/view/runner'
|
9
|
+
autoload :Yaml, 'web_crawler/view/yaml'
|
9
10
|
|
10
11
|
extend self
|
11
12
|
|
@@ -18,8 +19,9 @@ module WebCrawler::View
|
|
18
19
|
|
19
20
|
class << self
|
20
21
|
attr_accessor :default_options
|
22
|
+
|
21
23
|
def default_options
|
22
|
-
@default_options ||= { }
|
24
|
+
@default_options ||= { 'output' => $stdout }
|
23
25
|
end
|
24
26
|
end
|
25
27
|
|
@@ -32,13 +34,29 @@ module WebCrawler::View
|
|
32
34
|
[*input].map { |i| format(i) }.join
|
33
35
|
end
|
34
36
|
|
35
|
-
def draw(output
|
36
|
-
|
37
|
+
def draw(output=nil)
|
38
|
+
begin
|
39
|
+
present_output(output).puts render
|
40
|
+
ensure
|
41
|
+
output.close if output.respond_to? :close
|
42
|
+
end
|
37
43
|
end
|
38
44
|
|
39
45
|
def format(item)
|
40
46
|
item
|
41
47
|
end
|
48
|
+
|
49
|
+
protected
|
50
|
+
|
51
|
+
def present_output(override=nil)
|
52
|
+
@present_output = if override && override.respond_to?(:puts)
|
53
|
+
override
|
54
|
+
elsif @options['output'].is_a?(String)
|
55
|
+
File.open(@options['output'], 'w+')
|
56
|
+
elsif @options['output'].respond_to? :puts
|
57
|
+
@options['output']
|
58
|
+
end
|
59
|
+
end
|
42
60
|
end
|
43
61
|
|
44
62
|
end
|
data/web_crawler.gemspec
CHANGED
@@ -23,7 +23,8 @@ Gem::Specification.new do |s|
|
|
23
23
|
|
24
24
|
s.bindir = "bin"
|
25
25
|
|
26
|
-
s.add_dependency 'thor'
|
26
|
+
s.add_dependency 'thor', '>=0.14.6'
|
27
|
+
s.add_dependency 'mime-types', '>=1.16'
|
27
28
|
|
28
29
|
s.add_development_dependency(%q<rspec>, [">=2.6"])
|
29
30
|
s.add_development_dependency(%q<fakeweb>)
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
7
|
+
- 3
|
8
|
+
- 1
|
9
|
+
version: 0.3.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Anton Sozontov
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-06-03 00:00:00 +04:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -27,13 +27,29 @@ dependencies:
|
|
27
27
|
- !ruby/object:Gem::Version
|
28
28
|
segments:
|
29
29
|
- 0
|
30
|
-
|
30
|
+
- 14
|
31
|
+
- 6
|
32
|
+
version: 0.14.6
|
31
33
|
type: :runtime
|
32
34
|
version_requirements: *id001
|
33
35
|
- !ruby/object:Gem::Dependency
|
34
|
-
name:
|
36
|
+
name: mime-types
|
35
37
|
prerelease: false
|
36
38
|
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
segments:
|
44
|
+
- 1
|
45
|
+
- 16
|
46
|
+
version: "1.16"
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id002
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: rspec
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
37
53
|
none: false
|
38
54
|
requirements:
|
39
55
|
- - ">="
|
@@ -43,11 +59,11 @@ dependencies:
|
|
43
59
|
- 6
|
44
60
|
version: "2.6"
|
45
61
|
type: :development
|
46
|
-
version_requirements: *
|
62
|
+
version_requirements: *id003
|
47
63
|
- !ruby/object:Gem::Dependency
|
48
64
|
name: fakeweb
|
49
65
|
prerelease: false
|
50
|
-
requirement: &
|
66
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
51
67
|
none: false
|
52
68
|
requirements:
|
53
69
|
- - ">="
|
@@ -56,7 +72,7 @@ dependencies:
|
|
56
72
|
- 0
|
57
73
|
version: "0"
|
58
74
|
type: :development
|
59
|
-
version_requirements: *
|
75
|
+
version_requirements: *id004
|
60
76
|
description: Web crawler help you with parse and collect data from the web
|
61
77
|
email:
|
62
78
|
- a.sozontov@gmail.com
|
@@ -104,6 +120,7 @@ files:
|
|
104
120
|
- lib/web_crawler/view/runner.rb
|
105
121
|
- lib/web_crawler/view/table.rb
|
106
122
|
- lib/web_crawler/view/xml.rb
|
123
|
+
- lib/web_crawler/view/yaml.rb
|
107
124
|
- spec/fake_web_generator.rb
|
108
125
|
- spec/spec_helper.rb
|
109
126
|
- spec/web_crawler/batch_request_spec.rb
|