maxwell 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +15 -12
- data/lib/maxwell/converter.rb +36 -13
- data/lib/maxwell/version.rb +2 -2
- data/lib/maxwell.rb +17 -5
- data/maxwell.gemspec +2 -0
- metadata +30 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0b4a07c10e6638f009e5d96fa34d98d2c92985aa
|
4
|
+
data.tar.gz: 641ac72166235db4b28e7ccebee77e8da1b7c3e0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 04d92bce282bbbf263c83b1434f05cbd9043a169bf893a360a80701755d645aef03c60af4a69187ac763c8a1fa95e1596854d175fd0aaa6d66c787c3eab36081
|
7
|
+
data.tar.gz: 8cfb57804bc5575573a30cb10166cf46b8c6761969ca8bbdef1accd758c16d0823a9c6c203103bf45a244303f9c54f8c26ff1f709425d3c944e7ec7767e7819b
|
data/README.md
CHANGED
@@ -21,19 +21,22 @@ Or install it yourself as:
|
|
21
21
|
## Usage
|
22
22
|
|
23
23
|
```ruby
|
24
|
-
Maxwell::
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
24
|
+
class YahooScraper < Maxwell::Base
|
25
|
+
attr_scrape :title, :url, :address
|
26
|
+
|
27
|
+
regist_strategy("h3.slcHead.cFix a") do
|
28
|
+
@title = @html.title
|
29
|
+
@url = @html.css("td.sdhk jdj").text
|
30
|
+
@address = @html.css("table tr.ddad").text
|
31
|
+
end
|
32
|
+
|
33
|
+
regist_handler do |result|
|
34
|
+
p result
|
35
|
+
end
|
36
|
+
|
36
37
|
end
|
38
|
+
|
39
|
+
YahooScraper.new.execute("https://www.yahoo.com/")
|
37
40
|
```
|
38
41
|
|
39
42
|
## Development
|
data/lib/maxwell/converter.rb
CHANGED
@@ -1,22 +1,45 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
require 'httpclient'
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
client = HTTPClient.new(
|
8
|
-
default_header: {
|
9
|
-
"User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
|
10
|
-
}
|
11
|
-
)
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'capybara'
|
6
|
+
require 'capybara/poltergeist'
|
12
7
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
8
|
+
module Maxwell
|
9
|
+
class Converter
|
10
|
+
@user_agent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
|
11
|
+
class << self
|
12
|
+
def call(url, use_poltergeist=false)
|
13
|
+
use_poltergeist ? call_with_js(url) : call_without_js(url)
|
17
14
|
end
|
18
15
|
|
19
|
-
|
16
|
+
def call_without_js(url)
|
17
|
+
client = HTTPClient.new(
|
18
|
+
default_header: {
|
19
|
+
"User-Agent" => @user_agent
|
20
|
+
}
|
21
|
+
)
|
22
|
+
|
23
|
+
html = begin
|
24
|
+
client.get_content(url)
|
25
|
+
rescue
|
26
|
+
""
|
27
|
+
end
|
28
|
+
|
29
|
+
Nokogiri::HTML(html)
|
30
|
+
end
|
31
|
+
|
32
|
+
def call_with_js(url)
|
33
|
+
Capybara.register_driver :poltergeist do |app|
|
34
|
+
Capybara::Poltergeist::Driver.new(app, { js_errors: false, timeout: 1000 })
|
35
|
+
end
|
36
|
+
Capybara.default_selector = :xpath
|
37
|
+
session = Capybara::Session.new(:poltergeist)
|
38
|
+
|
39
|
+
session.driver.headers = { 'User-Agent' => @user_agent }
|
40
|
+
session.visit url
|
41
|
+
Nokogiri::HTML(session.html)
|
42
|
+
end
|
20
43
|
end
|
21
44
|
end
|
22
45
|
end
|
data/lib/maxwell/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = "0.2.0"
|
1
|
+
module Maxwell
|
2
|
+
VERSION = "0.3.0"
|
3
3
|
end
|
data/lib/maxwell.rb
CHANGED
@@ -1,19 +1,23 @@
|
|
1
1
|
require "maxwell/converter"
|
2
2
|
|
3
|
-
|
3
|
+
module Maxwell
|
4
4
|
class Base
|
5
5
|
class << self
|
6
6
|
def attr_scrape(*attr_scrapes)
|
7
7
|
@acquirer_class = Class.new do
|
8
8
|
attr_accessor *attr_scrapes
|
9
|
-
|
9
|
+
@attr_scrapes = attr_scrapes
|
10
|
+
|
11
|
+
def self.attr_scrapes
|
12
|
+
@attr_scrapes
|
13
|
+
end
|
10
14
|
|
11
15
|
def initialize(nokogiri_obj)
|
12
16
|
@html = nokogiri_obj
|
13
17
|
end
|
14
18
|
|
15
19
|
def result
|
16
|
-
|
20
|
+
self.class.attr_scrapes.map { |k| [k, send(k)] }.to_h
|
17
21
|
end
|
18
22
|
end
|
19
23
|
end
|
@@ -26,11 +30,15 @@ class Maxwell
|
|
26
30
|
def regist_handler(&handler_blk)
|
27
31
|
@handler_blk = handler_blk
|
28
32
|
end
|
33
|
+
|
34
|
+
def use_poltergeist(value)
|
35
|
+
@use_poltergeist = value
|
36
|
+
end
|
29
37
|
end
|
30
38
|
|
31
39
|
def execute(root_url)
|
32
40
|
if self.link_selectore
|
33
|
-
html = Maxwell::Converter.
|
41
|
+
html = Maxwell::Converter.call(root_url, use_poltergeist)
|
34
42
|
html.css(self.link_selectore).each do |a|
|
35
43
|
execute_for_result a[:href]
|
36
44
|
end
|
@@ -39,6 +47,10 @@ class Maxwell
|
|
39
47
|
end
|
40
48
|
end
|
41
49
|
|
50
|
+
def use_poltergeist
|
51
|
+
self.class.instance_eval("@use_poltergeist")
|
52
|
+
end
|
53
|
+
|
42
54
|
def link_selectore
|
43
55
|
self.class.instance_eval("@link_selectore")
|
44
56
|
end
|
@@ -57,7 +69,7 @@ class Maxwell
|
|
57
69
|
|
58
70
|
private
|
59
71
|
def execute_for_result(tip_url)
|
60
|
-
acquirer = acquirer_class.new(Maxwell::Converter.
|
72
|
+
acquirer = acquirer_class.new(Maxwell::Converter.call(tip_url, use_poltergeist))
|
61
73
|
acquirer.instance_eval &self.strategy_blk
|
62
74
|
|
63
75
|
acquirer.result.tap do |result|
|
data/maxwell.gemspec
CHANGED
@@ -21,6 +21,8 @@ Gem::Specification.new do |spec|
|
|
21
21
|
|
22
22
|
spec.add_dependency "nokogiri"
|
23
23
|
spec.add_dependency "httpclient"
|
24
|
+
spec.add_dependency 'poltergeist'
|
25
|
+
spec.add_dependency 'capybara'
|
24
26
|
|
25
27
|
spec.add_development_dependency "bundler", "~> 1.10"
|
26
28
|
spec.add_development_dependency "rake", "~> 10.0"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: maxwell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- gogotanaka
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-02-
|
11
|
+
date: 2016-02-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -38,6 +38,34 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: poltergeist
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: capybara
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
41
69
|
- !ruby/object:Gem::Dependency
|
42
70
|
name: bundler
|
43
71
|
requirement: !ruby/object:Gem::Requirement
|