maxwell 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0b4a07c10e6638f009e5d96fa34d98d2c92985aa
4
- data.tar.gz: 641ac72166235db4b28e7ccebee77e8da1b7c3e0
3
+ metadata.gz: 3062e2f1c6b55eb5cc7318da36e44986f1d73fd2
4
+ data.tar.gz: d95d0c7e51a9d0dfe383bca8d09c22b3c26431e6
5
5
  SHA512:
6
- metadata.gz: 04d92bce282bbbf263c83b1434f05cbd9043a169bf893a360a80701755d645aef03c60af4a69187ac763c8a1fa95e1596854d175fd0aaa6d66c787c3eab36081
7
- data.tar.gz: 8cfb57804bc5575573a30cb10166cf46b8c6761969ca8bbdef1accd758c16d0823a9c6c203103bf45a244303f9c54f8c26ff1f709425d3c944e7ec7767e7819b
6
+ metadata.gz: 1bc5e93ee5211c5c9a4c1a5ed6390e6969bba4912677b7ad8a9e015b81b5f9106b7c5bf8302ef9d673a8995f8bbce4ca0d9379d73668d13cb22a6598bd6a91f6
7
+ data.tar.gz: 7f0e764c5e8a6b8ac3e43ec03dc2efc2b199a61cc7fc5cbc77534496e2a3d52e49e66f7dbc1725963982991ed230323b48def1b4e6769eb017d091a50cb567c5
data/README.md CHANGED
@@ -4,16 +4,10 @@ Maxwell makes web scraping more simpler and faster with Ruby.
4
4
 
5
5
  ## Installation
6
6
 
7
- Add this line to your application's Gemfile:
8
-
9
7
  ```ruby
10
- gem 'maxwell'
8
+ echo "gem 'maxwell'" >> Gemfile && bundle
11
9
  ```
12
10
 
13
- And then execute:
14
-
15
- $ bundle
16
-
17
11
  Or install it yourself as:
18
12
 
19
13
  $ gem install maxwell
@@ -22,21 +16,25 @@ Or install it yourself as:
22
16
 
23
17
  ```ruby
24
18
  class YahooScraper < Maxwell::Base
25
- attr_scrape :title, :url, :address
19
+ attr_accessor :title, :url, :address
26
20
 
27
- regist_strategy("h3.slcHead.cFix a") do
28
- @title = @html.title
29
- @url = @html.css("td.sdhk jdj").text
30
- @address = @html.css("table tr.ddad").text
21
+ javascript true
22
+
23
+ concurrency 4
24
+
25
+ def parser html
26
+ @title = html.title
27
+ @url = html.css("td.sdhk jdj").text
28
+ @address = html.css("table tr.ddad").text
31
29
  end
32
30
 
33
- regist_handler do |result|
31
+ def handler result
34
32
  p result
33
+ #=> { title: "...", url: "...", address: "..." }
35
34
  end
36
-
37
35
  end
38
36
 
39
- YahooScraper.new.execute("https://www.yahoo.com/")
37
+ YahooScraper.execute ["https://www.yahoo.com/"]
40
38
  ```
41
39
 
42
40
  ## Development
@@ -38,7 +38,9 @@ module Maxwell
38
38
 
39
39
  session.driver.headers = { 'User-Agent' => @user_agent }
40
40
  session.visit url
41
- Nokogiri::HTML(session.html)
41
+ Nokogiri::HTML(session.html).tap do
42
+ session.driver.quit
43
+ end
42
44
  end
43
45
  end
44
46
  end
@@ -0,0 +1,8 @@
1
module Maxwell
  # Utilities for collecting URLs that can then be handed to a scraper.
  class Helper
    # Fetches +url+ through Maxwell::Converter and returns the +href+ of every
    # element matched by +link_selectore+.
    #
    # @param url [String] page to fetch
    # @param link_selectore [String] CSS selector passed to the document's #css
    # @param use_poltergeist [Boolean] forwarded unchanged to Maxwell::Converter.call
    # @return [Array] href values in document order; matched elements without
    #   an href are skipped
    def self.open_links(url, link_selectore, use_poltergeist=false)
      document = ::Maxwell::Converter.call(url, use_poltergeist)

      # Looking up :href on an element that lacks the attribute yields nil;
      # drop those so callers only receive usable links.
      document.css(link_selectore).map { |a| a[:href] }.compact
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Maxwell
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  end
data/lib/maxwell.rb CHANGED
@@ -1,86 +1,62 @@
1
+ require 'parallel'
2
+
1
3
  require "maxwell/converter"
4
+ require "maxwell/helper"
2
5
 
3
6
  module Maxwell
7
+ class NoParserDefinedErr; end
8
+
4
9
  class Base
5
10
  class << self
6
- def attr_scrape(*attr_scrapes)
7
- @acquirer_class = Class.new do
8
- attr_accessor *attr_scrapes
9
- @attr_scrapes = attr_scrapes
11
+ def execute(urls)
12
+ Parallel.
13
+ map_with_index(urls, in_threads: @concurrency || 1) do |url, id|
14
+ p "scraping: #{ id + 1 }"
10
15
 
11
- def self.attr_scrapes
12
- @attr_scrapes
13
- end
16
+ scraper = self.new
17
+ html = Maxwell::Converter.call(url, @use_poltergeist)
14
18
 
15
- def initialize(nokogiri_obj)
16
- @html = nokogiri_obj
17
- end
19
+ scraper.parser html
18
20
 
19
- def result
20
- self.class.attr_scrapes.map { |k| [k, send(k)] }.to_h
21
+ scraper.handler ({ id: id + 1 }).merge(scraper.result)
21
22
  end
22
- end
23
23
  end
24
24
 
25
- def regist_strategy(link_selectore=nil, &strategy_blk)
26
- @link_selectore = link_selectore
27
- @strategy_blk = strategy_blk
25
+ def attr_accessor(*attrs)
26
+ @attrs ||= []
27
+ @attrs.concat attrs
28
+ super
28
29
  end
29
30
 
30
- def regist_handler(&handler_blk)
31
- @handler_blk = handler_blk
31
+ def attrs
32
+ @attrs
32
33
  end
33
34
 
34
- def use_poltergeist(value)
35
+ def javascript(value)
35
36
  @use_poltergeist = value
36
37
  end
37
- end
38
38
 
39
- def execute(root_url)
40
- if self.link_selectore
41
- html = Maxwell::Converter.call(root_url, use_poltergeist)
42
- html.css(self.link_selectore).each do |a|
43
- execute_for_result a[:href]
44
- end
45
- else
46
- execute_for_result root_url
39
+ def concurrency(value)
40
+ @concurrency = value
47
41
  end
48
42
  end
49
43
 
50
- def use_poltergeist
51
- self.class.instance_eval("@use_poltergeist")
52
- end
53
-
54
- def link_selectore
55
- self.class.instance_eval("@link_selectore")
56
- end
57
-
58
- def strategy_blk
59
- self.class.instance_eval("@strategy_blk")
44
+ def parser html
45
+ raise NoParserDefinedErr "You need to define #{self}#parser"
60
46
  end
61
47
 
62
- def handler_blk
63
- self.class.instance_eval("@handler_blk")
48
+ def handler result
49
+ p result
64
50
  end
65
51
 
66
- def acquirer_class
67
- self.class.instance_eval("@acquirer_class")
52
+ def result
53
+ self.class.attrs.map { |k| [k, self.send(k)] }.to_h
68
54
  end
69
-
70
- private
71
- def execute_for_result(tip_url)
72
- acquirer = acquirer_class.new(Maxwell::Converter.call(tip_url, use_poltergeist))
73
- acquirer.instance_eval &self.strategy_blk
74
-
75
- acquirer.result.tap do |result|
76
- self.handler_blk.call(result) if self.handler_blk
77
- end
78
- end
79
55
  end
80
56
  end
81
57
 
82
58
  class ::String
83
59
  def trim
84
- delete("\r\n\t")
60
+ delete "\r\n\t"
85
61
  end
86
62
  end
data/maxwell.gemspec CHANGED
@@ -23,6 +23,7 @@ Gem::Specification.new do |spec|
23
23
  spec.add_dependency "httpclient"
24
24
  spec.add_dependency 'poltergeist'
25
25
  spec.add_dependency 'capybara'
26
+ spec.add_dependency 'parallel'
26
27
 
27
28
  spec.add_development_dependency "bundler", "~> 1.10"
28
29
  spec.add_development_dependency "rake", "~> 10.0"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: maxwell
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - gogotanaka
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-02-18 00:00:00.000000000 Z
11
+ date: 2016-03-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: parallel
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: bundler
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -128,6 +142,7 @@ files:
128
142
  - exe/maxwell
129
143
  - lib/maxwell.rb
130
144
  - lib/maxwell/converter.rb
145
+ - lib/maxwell/helper.rb
131
146
  - lib/maxwell/version.rb
132
147
  - maxwell.gemspec
133
148
  homepage: http://gogotanaka.me/
@@ -150,7 +165,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
150
165
  version: '0'
151
166
  requirements: []
152
167
  rubyforge_project:
153
- rubygems_version: 2.5.1
168
+ rubygems_version: 2.4.5.1
154
169
  signing_key:
155
170
  specification_version: 4
156
171
  summary: Maxwell makes web scraping more simpler and faster with Ruby.