web_crawler 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -1,5 +1,8 @@
1
1
  source :gemcutter
2
2
 
3
+ gem 'thor', '>=0.14.6'
4
+ gem 'mime-types', '>=1.16'
5
+
3
6
  # Specify your gem's dependencies in web_crawler.gemspec
4
7
  gemspec
5
8
 
@@ -20,6 +20,7 @@ module WebCrawler
20
20
  block_given? ? yield(@handler.process) : @handler.process
21
21
  else
22
22
  @responses ||= requests.map do |req|
23
+ WebCrawler.logger.info "start request to #{req.url.to_s}"
23
24
  block_given? ? yield(req.process) : req.process
24
25
  end
25
26
  end
@@ -14,13 +14,16 @@ module WebCrawler
14
14
 
15
15
  class_option :format, type: :string, desc: "output format [json, xml, csv]", default: 'plain'
16
16
  class_option :json, type: :boolean, desc: "json output format. shortcut for --format json"
17
+ class_option :yaml, type: :boolean, desc: "yaml output format. shortcut for --format yaml"
17
18
  class_option :xml, type: :boolean, desc: "xml output format. shortcut for --format xml"
18
19
  class_option :csv, type: :boolean, desc: "csv output format. shortcut for --format csv"
19
20
  class_option :table, type: :boolean, desc: "table output format. shortcut for --format table"
20
21
  class_option :cached, type: :boolean, desc: "use cached requests. if ./tmp/cache exists use it for cache files"
21
22
  class_option :follow, type: :boolean, desc: "follow to urls on the pages"
22
23
  class_option :run, type: :string, desc: "run custom script with api access"
24
+ class_option :console, type: :boolean, desc: "run irb console after execution"
23
25
  class_option :log, type: :string, desc: "log file path"
26
+ class_option :output, type: :string, desc: "output file path"
24
27
 
25
28
  before_action except: :help do
26
29
  @options = options.dup
@@ -29,6 +32,7 @@ module WebCrawler
29
32
  @options[:format] = 'csv' if options[:csv]
30
33
  @options[:format] = 'table' if options[:table]
31
34
  @options[:format] = 'plain' if options[:plain]
35
+ @options[:format] = 'yaml' if options[:yaml]
32
36
 
33
37
  @options[:original_format] = @options[:format] if options[:run]
34
38
  @options[:format] = 'runner' if options[:run]
@@ -1,8 +1,10 @@
1
+ require 'mime/types'
2
+
1
3
  module WebCrawler
2
4
  class Response
3
5
  extend ::Forwardable
4
6
 
5
- delegate [:body, :http_version, :code, :message, :msg, :code_type, :[], :redirect_path, :redirect?] => '@response'
7
+ delegate [:http_version, :code, :message, :msg, :code_type, :[], :redirect_path, :redirect?] => '@response'
6
8
 
7
9
  attr_reader :url, :expire, :date, :cached
8
10
 
@@ -13,6 +15,14 @@ module WebCrawler
13
15
  @expire ||= Time.parse(self['Expires']) rescue Time.now
14
16
  end
15
17
 
18
+ [:xml, :html, :json].each do |type|
19
+ class_eval <<-RUBY, __FILE__, __LINE__ + 1
20
+ def #{type}?
21
+ mime_type.sub_type == '#{type}'
22
+ end
23
+ RUBY
24
+ end
25
+
16
26
  def set_cached_flag
17
27
  @cached = ' CACHED'
18
28
  end
@@ -36,8 +46,34 @@ module WebCrawler
36
46
  "#{redirected}>"
37
47
  end
38
48
 
49
+ def mime_type
50
+ MIME::Types[header['content-type']].first
51
+ end
52
+
53
+ def header
54
+ @header ||= Hash[@response.to_hash.map(&:flatten)]
55
+ end
56
+
57
+ def body
58
+ type, encoding = self['Content-Type'].split("=")
59
+ @body ||= if encoding.upcase == 'UTF-8'
60
+ @response.body
61
+ else
62
+ encode_body(encoding.upcase)
63
+ end
64
+ end
65
+
39
66
  alias :to_s :body
40
67
 
68
+ def encode_body(from)
69
+ require "iconv" unless defined?(Iconv)
70
+ encoded = Iconv.iconv('UTF-8', from, @response.body).first
71
+ if xml?
72
+ encoded = encoded.gsub(/<\?xml version="(.*?)" encoding=".*?"\?>/, "<?xml version=\"1.0\" encoding=\"utf-8\"?>")
73
+ end
74
+ encoded
75
+ end
76
+
41
77
  def type
42
78
  @response.class
43
79
  end
@@ -1,8 +1,8 @@
1
1
  module WebCrawler
2
2
  module VERSION
3
3
  MAJOR = 0
4
- MINOR = 2
5
- TINY = 0
4
+ MINOR = 3
5
+ TINY = 1
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
@@ -3,9 +3,22 @@ require "fileutils"
3
3
  module WebCrawler::View
4
4
  class Runner < Base
5
5
 
6
- module Space
7
- extend self
6
+ class WorkSpace
7
+ # array of responses
8
8
  attr_accessor :responses
9
+ attr_accessor :results
10
+
11
+ def q
12
+ exit
13
+ end
14
+
15
+ def returning(value)
16
+ self.results = value
17
+ end
18
+
19
+ def method_missing(meth, *args, &block)
20
+ puts "\e[31m\e[1mError: method \"\e[0m\e[31m#{meth}\e[0m\e[31m\e[1m\" is missing\e[0m"
21
+ end
9
22
  end
10
23
 
11
24
  def render
@@ -13,8 +26,48 @@ module WebCrawler::View
13
26
  @options['run'] = File.expand_path @options['run'], FileUtils.pwd
14
27
  end
15
28
 
16
- Space.responses = input.freeze
17
- Space.module_eval(File.open(@options['run'], 'r').read)
29
+ @work_space = WorkSpace.new
30
+ @work_space.responses = input.freeze
31
+ @work_space.results = eval(File.open(@options['run'], 'r').read, @work_space.instance_eval("binding"), @options['run'])
32
+
33
+ load_console! if @options['console']
34
+
35
+ WebCrawler::View.factory(@options['original_format'], @work_space.results, @options).render
36
+ end
37
+
38
+ def load_console!
39
+ require "irb"
40
+ IRB.init_config nil
41
+ IRB.instance_exec do
42
+ @CONF[:BACK_TRACE_LIMIT] = 1
43
+
44
+ @CONF[:PROMPT][:SIMPLE] = { :PROMPT_I => "[\e[1m\e[31mWebCrawler::API\e[0m](%n)>> ",
45
+ :PROMPT_N => "[\e[1m\e[31mWebCrawler::API\e[0m](%n)>> ",
46
+ :PROMPT_S => "[\e[1m\e[31mWebCrawler::API\e[0m](%n)*",
47
+ :PROMPT_C => "(%n)?> ",
48
+ :RETURN => "\e[90m#=> %s\n\e[0m" }
49
+
50
+ @CONF[:PROMPT_MODE] = :SIMPLE
51
+ end
52
+
53
+ irb = IRB::Irb.new IRB::WorkSpace.new(@work_space)
54
+
55
+
56
+ IRB.instance_exec { @CONF[:IRB_RC].call(irb.context) if @CONF[:IRB_RC] }
57
+ IRB.instance_exec { @CONF[:MAIN_CONTEXT] = irb.context }
58
+
59
+
60
+ trap("SIGINT") do
61
+ irb.signal_handle
62
+ end
63
+
64
+ begin
65
+ catch(:IRB_EXIT) do
66
+ irb.eval_input
67
+ end
68
+ ensure
69
+ IRB.irb_at_exit
70
+ end
18
71
  end
19
72
  end
20
73
  end
@@ -0,0 +1,9 @@
1
+ require 'yaml'
2
+
3
+ module WebCrawler::View
4
+ class Yaml < Base
5
+ def render
6
+ YAML.dump(responses: input)
7
+ end
8
+ end
9
+ end
@@ -6,6 +6,7 @@ module WebCrawler::View
6
6
  autoload :Plain, 'web_crawler/view/plain'
7
7
  autoload :Table, 'web_crawler/view/table'
8
8
  autoload :Runner, 'web_crawler/view/runner'
9
+ autoload :Yaml, 'web_crawler/view/yaml'
9
10
 
10
11
  extend self
11
12
 
@@ -18,8 +19,9 @@ module WebCrawler::View
18
19
 
19
20
  class << self
20
21
  attr_accessor :default_options
22
+
21
23
  def default_options
22
- @default_options ||= { }
24
+ @default_options ||= { 'output' => $stdout }
23
25
  end
24
26
  end
25
27
 
@@ -32,13 +34,29 @@ module WebCrawler::View
32
34
  [*input].map { |i| format(i) }.join
33
35
  end
34
36
 
35
- def draw(output=$stdout)
36
- output.puts render
37
+ def draw(output=nil)
38
+ begin
39
+ present_output(output).puts render
40
+ ensure
41
+ output.close if output.respond_to? :close
42
+ end
37
43
  end
38
44
 
39
45
  def format(item)
40
46
  item
41
47
  end
48
+
49
+ protected
50
+
51
+ def present_output(override=nil)
52
+ @present_output = if override && override.respond_to?(:puts)
53
+ override
54
+ elsif @options['output'].is_a?(String)
55
+ File.open(@options['output'], 'w+')
56
+ elsif @options['output'].respond_to? :puts
57
+ @options['output']
58
+ end
59
+ end
42
60
  end
43
61
 
44
62
  end
data/web_crawler.gemspec CHANGED
@@ -23,7 +23,8 @@ Gem::Specification.new do |s|
23
23
 
24
24
  s.bindir = "bin"
25
25
 
26
- s.add_dependency 'thor'
26
+ s.add_dependency 'thor', '>=0.14.6'
27
+ s.add_dependency 'mime-types', '>=1.16'
27
28
 
28
29
  s.add_development_dependency(%q<rspec>, [">=2.6"])
29
30
  s.add_development_dependency(%q<fakeweb>)
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 2
8
- - 0
9
- version: 0.2.0
7
+ - 3
8
+ - 1
9
+ version: 0.3.1
10
10
  platform: ruby
11
11
  authors:
12
12
  - Anton Sozontov
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-05-31 00:00:00 +04:00
17
+ date: 2011-06-03 00:00:00 +04:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -27,13 +27,29 @@ dependencies:
27
27
  - !ruby/object:Gem::Version
28
28
  segments:
29
29
  - 0
30
- version: "0"
30
+ - 14
31
+ - 6
32
+ version: 0.14.6
31
33
  type: :runtime
32
34
  version_requirements: *id001
33
35
  - !ruby/object:Gem::Dependency
34
- name: rspec
36
+ name: mime-types
35
37
  prerelease: false
36
38
  requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ segments:
44
+ - 1
45
+ - 16
46
+ version: "1.16"
47
+ type: :runtime
48
+ version_requirements: *id002
49
+ - !ruby/object:Gem::Dependency
50
+ name: rspec
51
+ prerelease: false
52
+ requirement: &id003 !ruby/object:Gem::Requirement
37
53
  none: false
38
54
  requirements:
39
55
  - - ">="
@@ -43,11 +59,11 @@ dependencies:
43
59
  - 6
44
60
  version: "2.6"
45
61
  type: :development
46
- version_requirements: *id002
62
+ version_requirements: *id003
47
63
  - !ruby/object:Gem::Dependency
48
64
  name: fakeweb
49
65
  prerelease: false
50
- requirement: &id003 !ruby/object:Gem::Requirement
66
+ requirement: &id004 !ruby/object:Gem::Requirement
51
67
  none: false
52
68
  requirements:
53
69
  - - ">="
@@ -56,7 +72,7 @@ dependencies:
56
72
  - 0
57
73
  version: "0"
58
74
  type: :development
59
- version_requirements: *id003
75
+ version_requirements: *id004
60
76
  description: Web crawler help you with parse and collect data from the web
61
77
  email:
62
78
  - a.sozontov@gmail.com
@@ -104,6 +120,7 @@ files:
104
120
  - lib/web_crawler/view/runner.rb
105
121
  - lib/web_crawler/view/table.rb
106
122
  - lib/web_crawler/view/xml.rb
123
+ - lib/web_crawler/view/yaml.rb
107
124
  - spec/fake_web_generator.rb
108
125
  - spec/spec_helper.rb
109
126
  - spec/web_crawler/batch_request_spec.rb