maxwell 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0b4a07c10e6638f009e5d96fa34d98d2c92985aa
4
- data.tar.gz: 641ac72166235db4b28e7ccebee77e8da1b7c3e0
3
+ metadata.gz: 3062e2f1c6b55eb5cc7318da36e44986f1d73fd2
4
+ data.tar.gz: d95d0c7e51a9d0dfe383bca8d09c22b3c26431e6
5
5
  SHA512:
6
- metadata.gz: 04d92bce282bbbf263c83b1434f05cbd9043a169bf893a360a80701755d645aef03c60af4a69187ac763c8a1fa95e1596854d175fd0aaa6d66c787c3eab36081
7
- data.tar.gz: 8cfb57804bc5575573a30cb10166cf46b8c6761969ca8bbdef1accd758c16d0823a9c6c203103bf45a244303f9c54f8c26ff1f709425d3c944e7ec7767e7819b
6
+ metadata.gz: 1bc5e93ee5211c5c9a4c1a5ed6390e6969bba4912677b7ad8a9e015b81b5f9106b7c5bf8302ef9d673a8995f8bbce4ca0d9379d73668d13cb22a6598bd6a91f6
7
+ data.tar.gz: 7f0e764c5e8a6b8ac3e43ec03dc2efc2b199a61cc7fc5cbc77534496e2a3d52e49e66f7dbc1725963982991ed230323b48def1b4e6769eb017d091a50cb567c5
data/README.md CHANGED
@@ -4,16 +4,10 @@ Maxwell makes web scraping simpler and faster with Ruby.
4
4
 
5
5
  ## Installation
6
6
 
7
- Add this line to your application's Gemfile:
8
-
9
7
  ```ruby
10
- gem 'maxwell'
8
+ echo "gem 'maxwell'" >> Gemfile && bundle
11
9
  ```
12
10
 
13
- And then execute:
14
-
15
- $ bundle
16
-
17
11
  Or install it yourself as:
18
12
 
19
13
  $ gem install maxwell
@@ -22,21 +16,25 @@ Or install it yourself as:
22
16
 
23
17
  ```ruby
24
18
  class YahooScraper < Maxwell::Base
25
- attr_scrape :title, :url, :address
19
+ attr_accessor :title, :url, :address
26
20
 
27
- regist_strategy("h3.slcHead.cFix a") do
28
- @title = @html.title
29
- @url = @html.css("td.sdhk jdj").text
30
- @address = @html.css("table tr.ddad").text
21
+ javascript true
22
+
23
+ concurrency 4
24
+
25
+ def parser html
26
+ @title = html.title
27
+ @url = html.css("td.sdhk jdj").text
28
+ @address = html.css("table tr.ddad").text
31
29
  end
32
30
 
33
- regist_handler do |result|
31
+ def handler result
34
32
  p result
33
+ #=> { title: "...", url: "...", address: "..." }
35
34
  end
36
-
37
35
  end
38
36
 
39
- YahooScraper.new.execute("https://www.yahoo.com/")
37
+ YahooScraper.execute ["https://www.yahoo.com/"]
40
38
  ```
41
39
 
42
40
  ## Development
@@ -38,7 +38,9 @@ module Maxwell
38
38
 
39
39
  session.driver.headers = { 'User-Agent' => @user_agent }
40
40
  session.visit url
41
- Nokogiri::HTML(session.html)
41
+ Nokogiri::HTML(session.html).tap do
42
+ session.driver.quit
43
+ end
42
44
  end
43
45
  end
44
46
  end
@@ -0,0 +1,8 @@
1
+ module Maxwell
2
+ class Helper
3
+ def self.open_links(url, link_selectore, use_poltergeist=false)
4
+ html = ::Maxwell::Converter.call(url, use_poltergeist)
5
+ html.css(link_selectore).map { |a| a[:href] }
6
+ end
7
+ end
8
+ end
@@ -1,3 +1,3 @@
1
1
  module Maxwell
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  end
data/lib/maxwell.rb CHANGED
@@ -1,86 +1,62 @@
1
+ require 'parallel'
2
+
1
3
  require "maxwell/converter"
4
+ require "maxwell/helper"
2
5
 
3
6
  module Maxwell
7
+ class NoParserDefinedErr; end
8
+
4
9
  class Base
5
10
  class << self
6
- def attr_scrape(*attr_scrapes)
7
- @acquirer_class = Class.new do
8
- attr_accessor *attr_scrapes
9
- @attr_scrapes = attr_scrapes
11
+ def execute(urls)
12
+ Parallel.
13
+ map_with_index(urls, in_threads: @concurrency || 1) do |url, id|
14
+ p "scraping: #{ id + 1 }"
10
15
 
11
- def self.attr_scrapes
12
- @attr_scrapes
13
- end
16
+ scraper = self.new
17
+ html = Maxwell::Converter.call(url, @use_poltergeist)
14
18
 
15
- def initialize(nokogiri_obj)
16
- @html = nokogiri_obj
17
- end
19
+ scraper.parser html
18
20
 
19
- def result
20
- self.class.attr_scrapes.map { |k| [k, send(k)] }.to_h
21
+ scraper.handler ({ id: id + 1 }).merge(scraper.result)
21
22
  end
22
- end
23
23
  end
24
24
 
25
- def regist_strategy(link_selectore=nil, &strategy_blk)
26
- @link_selectore = link_selectore
27
- @strategy_blk = strategy_blk
25
+ def attr_accessor(*attrs)
26
+ @attrs ||= []
27
+ @attrs.concat attrs
28
+ super
28
29
  end
29
30
 
30
- def regist_handler(&handler_blk)
31
- @handler_blk = handler_blk
31
+ def attrs
32
+ @attrs
32
33
  end
33
34
 
34
- def use_poltergeist(value)
35
+ def javascript(value)
35
36
  @use_poltergeist = value
36
37
  end
37
- end
38
38
 
39
- def execute(root_url)
40
- if self.link_selectore
41
- html = Maxwell::Converter.call(root_url, use_poltergeist)
42
- html.css(self.link_selectore).each do |a|
43
- execute_for_result a[:href]
44
- end
45
- else
46
- execute_for_result root_url
39
+ def concurrency(value)
40
+ @concurrency = value
47
41
  end
48
42
  end
49
43
 
50
- def use_poltergeist
51
- self.class.instance_eval("@use_poltergeist")
52
- end
53
-
54
- def link_selectore
55
- self.class.instance_eval("@link_selectore")
56
- end
57
-
58
- def strategy_blk
59
- self.class.instance_eval("@strategy_blk")
44
+ def parser html
45
+ raise NoParserDefinedErr "You need to define #{self}#parser"
60
46
  end
61
47
 
62
- def handler_blk
63
- self.class.instance_eval("@handler_blk")
48
+ def handler result
49
+ p result
64
50
  end
65
51
 
66
- def acquirer_class
67
- self.class.instance_eval("@acquirer_class")
52
+ def result
53
+ self.class.attrs.map { |k| [k, self.send(k)] }.to_h
68
54
  end
69
-
70
- private
71
- def execute_for_result(tip_url)
72
- acquirer = acquirer_class.new(Maxwell::Converter.call(tip_url, use_poltergeist))
73
- acquirer.instance_eval &self.strategy_blk
74
-
75
- acquirer.result.tap do |result|
76
- self.handler_blk.call(result) if self.handler_blk
77
- end
78
- end
79
55
  end
80
56
  end
81
57
 
82
58
  class ::String
83
59
  def trim
84
- delete("\r\n\t")
60
+ delete "\r\n\t"
85
61
  end
86
62
  end
data/maxwell.gemspec CHANGED
@@ -23,6 +23,7 @@ Gem::Specification.new do |spec|
23
23
  spec.add_dependency "httpclient"
24
24
  spec.add_dependency 'poltergeist'
25
25
  spec.add_dependency 'capybara'
26
+ spec.add_dependency 'parallel'
26
27
 
27
28
  spec.add_development_dependency "bundler", "~> 1.10"
28
29
  spec.add_development_dependency "rake", "~> 10.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: maxwell
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - gogotanaka
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-02-18 00:00:00.000000000 Z
11
+ date: 2016-03-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: parallel
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: bundler
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -128,6 +142,7 @@ files:
128
142
  - exe/maxwell
129
143
  - lib/maxwell.rb
130
144
  - lib/maxwell/converter.rb
145
+ - lib/maxwell/helper.rb
131
146
  - lib/maxwell/version.rb
132
147
  - maxwell.gemspec
133
148
  homepage: http://gogotanaka.me/
@@ -150,7 +165,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
150
165
  version: '0'
151
166
  requirements: []
152
167
  rubyforge_project:
153
- rubygems_version: 2.5.1
168
+ rubygems_version: 2.4.5.1
154
169
  signing_key:
155
170
  specification_version: 4
156
171
  summary: Maxwell makes web scraping simpler and faster with Ruby.