web_crawler 0.2.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -1,5 +1,8 @@
1
1
  source :gemcutter
2
2
 
3
+ gem 'thor', '>=0.14.6'
4
+ gem 'mime-types', '>=1.16'
5
+
3
6
  # Specify your gem's dependencies in web_crawler.gemspec
4
7
  gemspec
5
8
 
@@ -20,6 +20,7 @@ module WebCrawler
20
20
  block_given? ? yield(@handler.process) : @handler.process
21
21
  else
22
22
  @responses ||= requests.map do |req|
23
+ WebCrawler.logger.info "start request to #{req.url.to_s}"
23
24
  block_given? ? yield(req.process) : req.process
24
25
  end
25
26
  end
@@ -14,13 +14,16 @@ module WebCrawler
14
14
 
15
15
  class_option :format, type: :string, desc: "output format [json, xml, csv]", default: 'plain'
16
16
  class_option :json, type: :boolean, desc: "json output format. shortcut for --format json"
17
+ class_option :yaml, type: :boolean, desc: "yaml output format. shortcut for --format yaml"
17
18
  class_option :xml, type: :boolean, desc: "xml output format. shortcut for --format xml"
18
19
  class_option :csv, type: :boolean, desc: "csv output format. shortcut for --format csv"
19
20
  class_option :table, type: :boolean, desc: "table output format. shortcut for --format table"
20
21
  class_option :cached, type: :boolean, desc: "use cached requests. if ./tmp/cache exists use it for cache files"
21
22
  class_option :follow, type: :boolean, desc: "follow to urls on the pages"
22
23
  class_option :run, type: :string, desc: "run custom script with api access"
24
+ class_option :console, type: :boolean, desc: "run irb console after execution"
23
25
  class_option :log, type: :string, desc: "log file path"
26
+ class_option :output, type: :string, desc: "output file path"
24
27
 
25
28
  before_action except: :help do
26
29
  @options = options.dup
@@ -29,6 +32,7 @@ module WebCrawler
29
32
  @options[:format] = 'csv' if options[:csv]
30
33
  @options[:format] = 'table' if options[:table]
31
34
  @options[:format] = 'plain' if options[:plain]
35
+ @options[:format] = 'yaml' if options[:yaml]
32
36
 
33
37
  @options[:original_format] = @options[:format] if options[:run]
34
38
  @options[:format] = 'runner' if options[:run]
@@ -1,8 +1,10 @@
1
+ require 'mime/types'
2
+
1
3
  module WebCrawler
2
4
  class Response
3
5
  extend ::Forwardable
4
6
 
5
- delegate [:body, :http_version, :code, :message, :msg, :code_type, :[], :redirect_path, :redirect?] => '@response'
7
+ delegate [:http_version, :code, :message, :msg, :code_type, :[], :redirect_path, :redirect?] => '@response'
6
8
 
7
9
  attr_reader :url, :expire, :date, :cached
8
10
 
@@ -13,6 +15,14 @@ module WebCrawler
13
15
  @expire ||= Time.parse(self['Expires']) rescue Time.now
14
16
  end
15
17
 
18
# Generates the content-type predicates xml?, html? and json?.
#
# Each predicate compares the sub type of #mime_type (e.g. "html" for
# "text/html") with its own name. When #mime_type is nil (no MIME type could
# be determined) the predicate answers false instead of raising
# NoMethodError on nil.
[:xml, :html, :json].each do |type|
  # class_eval with a string keeps real method definitions that report this
  # file and line in backtraces.
  class_eval <<-RUBY, __FILE__, __LINE__ + 1
    def #{type}?
      detected = mime_type
      !detected.nil? && detected.sub_type == '#{type}'
    end
  RUBY
end
25
+
16
26
  def set_cached_flag
17
27
  @cached = ' CACHED'
18
28
  end
@@ -36,8 +46,34 @@ module WebCrawler
36
46
  "#{redirected}>"
37
47
  end
38
48
 
49
# Looks up the MIME type announced by the response's content-type header.
#
# Header parameters such as "; charset=utf-8" are stripped before the lookup
# so only the bare media type ("text/html") is passed to MIME::Types.
#
# @return [Object, nil] the first entry MIME::Types returns for the media
#   type, or nil when the header is absent/blank or nothing matches
def mime_type
  media_type = header['content-type'].to_s.split(';').first.to_s.strip
  return nil if media_type.empty?

  MIME::Types[media_type].first
end
52
+
53
# Flattened, memoized view of the response headers.
#
# Reads @response.to_hash (assumed to map each header name to an Array of
# values, as Net::HTTPHeader#to_hash does) and joins the values of every
# header with ", ". Single-valued headers therefore keep their plain string
# value, while repeated headers (e.g. Set-Cookie) no longer produce an
# invalid key/value pair.
#
# @return [Hash{String => String}] header name => joined header value
def header
  @header ||= @response.to_hash.each_with_object({}) do |(name, values), flat|
    flat[name] = Array(values).join(', ')
  end
end
56
+
57
# Response body as UTF-8 text (memoized).
#
# The charset is read from the Content-Type header. When it is UTF-8, or when
# no charset (or no Content-Type header at all) is present, the raw body of
# @response is returned unchanged; otherwise the body is converted through
# #encode_body, which receives the upper-cased charset name.
#
# @return [String] the response body
def body
  @body ||= begin
    # Accepts both charset=utf-8 and charset="utf-8"; nil when absent.
    charset = self['Content-Type'].to_s[/charset\s*=\s*["']?([^\s;"']+)/i, 1]

    if charset.nil? || charset.casecmp('UTF-8').zero?
      @response.body
    else
      encode_body(charset.upcase)
    end
  end
end
65
+
39
66
  alias :to_s :body
40
67
 
68
# Converts the raw response body from the given charset to UTF-8.
#
# Uses String#encode instead of Iconv, which is no longer part of the Ruby
# standard library. For XML responses (see #xml?) the encoding declared in
# the XML prolog is rewritten to utf-8 so it matches the converted text.
#
# @param from [String] name of the source charset, e.g. "WINDOWS-1251"
# @return [String] the body transcoded to UTF-8
# @raise [ArgumentError] if Ruby does not know the charset name
# @raise [EncodingError] if the body contains bytes invalid for that charset
def encode_body(from)
  # dup: force_encoding mutates its receiver and the body may be shared/frozen.
  encoded = @response.body.dup.force_encoding(from).encode('UTF-8')
  if xml?
    encoded = encoded.gsub(/<\?xml version="(.*?)" encoding=".*?"\?>/, "<?xml version=\"1.0\" encoding=\"utf-8\"?>")
  end
  encoded
end
76
+
41
77
  def type
42
78
  @response.class
43
79
  end
@@ -1,8 +1,8 @@
1
1
  module WebCrawler
2
2
  module VERSION
3
3
  MAJOR = 0
4
- MINOR = 2
5
- TINY = 0
4
+ MINOR = 3
5
+ TINY = 1
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
@@ -3,9 +3,22 @@ require "fileutils"
3
3
  module WebCrawler::View
4
4
  class Runner < Base
5
5
 
6
- module Space
7
- extend self
6
# Object that holds the crawled responses and the value produced for output.
# It exposes a few helper methods (q, returning) and never raises on an
# unknown method name.
+ class WorkSpace
7
+ # array of responses
8
8
  attr_accessor :responses
9
# Value stored by #returning (or assigned directly through the writer).
+ attr_accessor :results
10
+
11
# Shortcut that terminates the process via Kernel#exit.
+ def q
12
+ exit
13
+ end
14
+
15
# Stores +value+ as the results and returns it.
+ def returning(value)
16
+ self.results = value
17
+ end
18
+
19
# Catch-all for undefined methods: prints a red/bold ANSI-colored error
# naming the missing method instead of raising NoMethodError. Arguments and
# block are ignored; the return value is that of puts (nil).
+ def method_missing(meth, *args, &block)
20
+ puts "\e[31m\e[1mError: method \"\e[0m\e[31m#{meth}\e[0m\e[31m\e[1m\" is missing\e[0m"
21
+ end
9
22
  end
10
23
 
11
24
  def render
@@ -13,8 +26,48 @@ module WebCrawler::View
13
26
  @options['run'] = File.expand_path @options['run'], FileUtils.pwd
14
27
  end
15
28
 
16
- Space.responses = input.freeze
17
- Space.module_eval(File.open(@options['run'], 'r').read)
29
+ @work_space = WorkSpace.new
30
+ @work_space.responses = input.freeze
31
+ @work_space.results = eval(File.open(@options['run'], 'r').read, @work_space.instance_eval("binding"), @options['run'])
32
+
33
+ load_console! if @options['console']
34
+
35
+ WebCrawler::View.factory(@options['original_format'], @work_space.results, @options).render
36
+ end
37
+
38
# Starts an interactive IRB session whose main object is @work_space.
# Loads irb, initializes its configuration, installs a custom :SIMPLE prompt,
# then runs the read-eval loop until IRB throws :IRB_EXIT.
+ def load_console!
39
+ require "irb"
40
+ IRB.init_config nil
41
# instance_exec gives direct access to IRB's @CONF configuration hash.
+ IRB.instance_exec do
42
+ @CONF[:BACK_TRACE_LIMIT] = 1
43
+
44
# Prompt strings contain ANSI color escapes; %n is IRB's line-number placeholder.
+ @CONF[:PROMPT][:SIMPLE] = { :PROMPT_I => "[\e[1m\e[31mWebCrawler::API\e[0m](%n)>> ",
45
+ :PROMPT_N => "[\e[1m\e[31mWebCrawler::API\e[0m](%n)>> ",
46
+ :PROMPT_S => "[\e[1m\e[31mWebCrawler::API\e[0m](%n)*",
47
+ :PROMPT_C => "(%n)?> ",
48
+ :RETURN => "\e[90m#=> %s\n\e[0m" }
49
+
50
+ @CONF[:PROMPT_MODE] = :SIMPLE
51
+ end
52
+
53
# Wrap the runner's work space object so console input is evaluated against it.
+ irb = IRB::Irb.new IRB::WorkSpace.new(@work_space)
54
+
55
+
56
# Call the IRB_RC hook when one is configured, then register the main context.
+ IRB.instance_exec { @CONF[:IRB_RC].call(irb.context) if @CONF[:IRB_RC] }
57
+ IRB.instance_exec { @CONF[:MAIN_CONTEXT] = irb.context }
58
+
59
+
60
# Forward Ctrl-C to IRB's own signal handler.
+ trap("SIGINT") do
61
+ irb.signal_handle
62
+ end
63
+
64
# IRB.irb_at_exit is called even when the input loop ends with an exception.
+ begin
65
+ catch(:IRB_EXIT) do
66
+ irb.eval_input
67
+ end
68
+ ensure
69
+ IRB.irb_at_exit
70
+ end
18
71
  end
19
72
  end
20
73
  end
@@ -0,0 +1,9 @@
1
+ require 'yaml'
2
+
3
# YAML output view.
+ module WebCrawler::View
4
+ class Yaml < Base
5
# Serializes the view input with YAML.dump as a one-entry hash under the
# :responses key and returns the resulting YAML string.
+ def render
6
+ YAML.dump(responses: input)
7
+ end
8
+ end
9
+ end
@@ -6,6 +6,7 @@ module WebCrawler::View
6
6
  autoload :Plain, 'web_crawler/view/plain'
7
7
  autoload :Table, 'web_crawler/view/table'
8
8
  autoload :Runner, 'web_crawler/view/runner'
9
+ autoload :Yaml, 'web_crawler/view/yaml'
9
10
 
10
11
  extend self
11
12
 
@@ -18,8 +19,9 @@ module WebCrawler::View
18
19
 
19
20
  class << self
20
21
  attr_accessor :default_options
22
+
21
23
  def default_options
22
- @default_options ||= { }
24
+ @default_options ||= { 'output' => $stdout }
23
25
  end
24
26
  end
25
27
 
@@ -32,13 +34,29 @@ module WebCrawler::View
32
34
  [*input].map { |i| format(i) }.join
33
35
  end
34
36
 
35
- def draw(output=$stdout)
36
- output.puts render
37
# Renders the view and writes the result to the output stream.
#
# The stream is chosen by #present_output: the +output+ argument when given,
# otherwise whatever @options['output'] designates. Only a file that was
# opened here for a path given in @options['output'] is closed afterwards;
# streams owned by the caller (the +output+ argument, or an IO stored in the
# options such as $stdout) are left open.
#
# @param output [#puts, nil] optional stream that overrides the configured one
# @return [nil] the result of IO#puts
def draw(output=nil)
  stream = present_output(output)
  stream.puts render
ensure
  # stream is nil when present_output itself failed.
  opened_here = stream && !stream.equal?(output) && @options['output'].is_a?(String)
  stream.close if opened_here && stream.respond_to?(:close) && !stream.closed?
end
38
44
 
39
45
  def format(item)
40
46
  item
41
47
  end
48
+
49
+ protected
50
+
51
# Resolves the stream that rendered output is written to.
#
# Precedence:
# 1. +override+, when it responds to #puts;
# 2. a file opened in 'w+' mode when @options['output'] is a String path
#    (the caller is responsible for closing it);
# 3. @options['output'] itself when it responds to #puts;
# 4. $stdout as a fallback, so a missing or unusable 'output' option no
#    longer yields nil.
#
# The chosen stream is also stored in @present_output.
#
# @param override [#puts, nil] stream that takes priority over the options
# @return [#puts] the resolved output stream
def present_output(override=nil)
  target = @options['output']

  @present_output =
    if override && override.respond_to?(:puts)
      override
    elsif target.is_a?(String)
      File.open(target, 'w+')
    elsif target.respond_to?(:puts)
      target
    else
      $stdout
    end
end
42
60
  end
43
61
 
44
62
  end
data/web_crawler.gemspec CHANGED
@@ -23,7 +23,8 @@ Gem::Specification.new do |s|
23
23
 
24
24
  s.bindir = "bin"
25
25
 
26
- s.add_dependency 'thor'
26
+ s.add_dependency 'thor', '>=0.14.6'
27
+ s.add_dependency 'mime-types', '>=1.16'
27
28
 
28
29
  s.add_development_dependency(%q<rspec>, [">=2.6"])
29
30
  s.add_development_dependency(%q<fakeweb>)
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 2
8
- - 0
9
- version: 0.2.0
7
+ - 3
8
+ - 1
9
+ version: 0.3.1
10
10
  platform: ruby
11
11
  authors:
12
12
  - Anton Sozontov
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-05-31 00:00:00 +04:00
17
+ date: 2011-06-03 00:00:00 +04:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -27,13 +27,29 @@ dependencies:
27
27
  - !ruby/object:Gem::Version
28
28
  segments:
29
29
  - 0
30
- version: "0"
30
+ - 14
31
+ - 6
32
+ version: 0.14.6
31
33
  type: :runtime
32
34
  version_requirements: *id001
33
35
  - !ruby/object:Gem::Dependency
34
- name: rspec
36
+ name: mime-types
35
37
  prerelease: false
36
38
  requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ segments:
44
+ - 1
45
+ - 16
46
+ version: "1.16"
47
+ type: :runtime
48
+ version_requirements: *id002
49
+ - !ruby/object:Gem::Dependency
50
+ name: rspec
51
+ prerelease: false
52
+ requirement: &id003 !ruby/object:Gem::Requirement
37
53
  none: false
38
54
  requirements:
39
55
  - - ">="
@@ -43,11 +59,11 @@ dependencies:
43
59
  - 6
44
60
  version: "2.6"
45
61
  type: :development
46
- version_requirements: *id002
62
+ version_requirements: *id003
47
63
  - !ruby/object:Gem::Dependency
48
64
  name: fakeweb
49
65
  prerelease: false
50
- requirement: &id003 !ruby/object:Gem::Requirement
66
+ requirement: &id004 !ruby/object:Gem::Requirement
51
67
  none: false
52
68
  requirements:
53
69
  - - ">="
@@ -56,7 +72,7 @@ dependencies:
56
72
  - 0
57
73
  version: "0"
58
74
  type: :development
59
- version_requirements: *id003
75
+ version_requirements: *id004
60
76
  description: Web crawler help you with parse and collect data from the web
61
77
  email:
62
78
  - a.sozontov@gmail.com
@@ -104,6 +120,7 @@ files:
104
120
  - lib/web_crawler/view/runner.rb
105
121
  - lib/web_crawler/view/table.rb
106
122
  - lib/web_crawler/view/xml.rb
123
+ - lib/web_crawler/view/yaml.rb
107
124
  - spec/fake_web_generator.rb
108
125
  - spec/spec_helper.rb
109
126
  - spec/web_crawler/batch_request_spec.rb