maxwell 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +15 -12
- data/lib/maxwell/converter.rb +36 -13
- data/lib/maxwell/version.rb +2 -2
- data/lib/maxwell.rb +17 -5
- data/maxwell.gemspec +2 -0
- metadata +30 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0b4a07c10e6638f009e5d96fa34d98d2c92985aa
|
4
|
+
data.tar.gz: 641ac72166235db4b28e7ccebee77e8da1b7c3e0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 04d92bce282bbbf263c83b1434f05cbd9043a169bf893a360a80701755d645aef03c60af4a69187ac763c8a1fa95e1596854d175fd0aaa6d66c787c3eab36081
|
7
|
+
data.tar.gz: 8cfb57804bc5575573a30cb10166cf46b8c6761969ca8bbdef1accd758c16d0823a9c6c203103bf45a244303f9c54f8c26ff1f709425d3c944e7ec7767e7819b
|
data/README.md
CHANGED
@@ -21,19 +21,22 @@ Or install it yourself as:
|
|
21
21
|
## Usage
|
22
22
|
|
23
23
|
```ruby
|
24
|
-
Maxwell::
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
24
|
+
class YahooScraper < Maxwell::Base
|
25
|
+
attr_scrape :title, :url, :address
|
26
|
+
|
27
|
+
regist_strategy("h3.slcHead.cFix a") do
|
28
|
+
@title = @html.title
|
29
|
+
@url = @html.css("td.sdhk jdj").text
|
30
|
+
@address = @html.css("table tr.ddad").text
|
31
|
+
end
|
32
|
+
|
33
|
+
regist_handler do |result|
|
34
|
+
p result
|
35
|
+
end
|
36
|
+
|
36
37
|
end
|
38
|
+
|
39
|
+
YahooScraper.new.execute("https://www.yahoo.com/")
|
37
40
|
```
|
38
41
|
|
39
42
|
## Development
|
data/lib/maxwell/converter.rb
CHANGED
@@ -1,22 +1,45 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
require 'httpclient'
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
client = HTTPClient.new(
|
8
|
-
default_header: {
|
9
|
-
"User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
|
10
|
-
}
|
11
|
-
)
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'capybara'
|
6
|
+
require 'capybara/poltergeist'
|
12
7
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
8
|
+
module Maxwell
|
9
|
+
class Converter
|
10
|
+
@user_agent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
|
11
|
+
class << self
|
12
|
+
def call(url, use_poltergeist=false)
|
13
|
+
use_poltergeist ? call_with_js(url) : call_without_js(url)
|
17
14
|
end
|
18
15
|
|
19
|
-
|
16
|
+
def call_without_js(url)
|
17
|
+
client = HTTPClient.new(
|
18
|
+
default_header: {
|
19
|
+
"User-Agent" => @user_agent
|
20
|
+
}
|
21
|
+
)
|
22
|
+
|
23
|
+
html = begin
|
24
|
+
client.get_content(url)
|
25
|
+
rescue
|
26
|
+
""
|
27
|
+
end
|
28
|
+
|
29
|
+
Nokogiri::HTML(html)
|
30
|
+
end
|
31
|
+
|
32
|
+
def call_with_js(url)
|
33
|
+
Capybara.register_driver :poltergeist do |app|
|
34
|
+
Capybara::Poltergeist::Driver.new(app, { js_errors: false, timeout: 1000 })
|
35
|
+
end
|
36
|
+
Capybara.default_selector = :xpath
|
37
|
+
session = Capybara::Session.new(:poltergeist)
|
38
|
+
|
39
|
+
session.driver.headers = { 'User-Agent' => @user_agent }
|
40
|
+
session.visit url
|
41
|
+
Nokogiri::HTML(session.html)
|
42
|
+
end
|
20
43
|
end
|
21
44
|
end
|
22
45
|
end
|
data/lib/maxwell/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = "0.2.0"
|
1
|
+
module Maxwell
|
2
|
+
VERSION = "0.3.0"
|
3
3
|
end
|
data/lib/maxwell.rb
CHANGED
@@ -1,19 +1,23 @@
|
|
1
1
|
require "maxwell/converter"
|
2
2
|
|
3
|
-
|
3
|
+
module Maxwell
|
4
4
|
class Base
|
5
5
|
class << self
|
6
6
|
def attr_scrape(*attr_scrapes)
|
7
7
|
@acquirer_class = Class.new do
|
8
8
|
attr_accessor *attr_scrapes
|
9
|
-
|
9
|
+
@attr_scrapes = attr_scrapes
|
10
|
+
|
11
|
+
def self.attr_scrapes
|
12
|
+
@attr_scrapes
|
13
|
+
end
|
10
14
|
|
11
15
|
def initialize(nokogiri_obj)
|
12
16
|
@html = nokogiri_obj
|
13
17
|
end
|
14
18
|
|
15
19
|
def result
|
16
|
-
|
20
|
+
self.class.attr_scrapes.map { |k| [k, send(k)] }.to_h
|
17
21
|
end
|
18
22
|
end
|
19
23
|
end
|
@@ -26,11 +30,15 @@ class Maxwell
|
|
26
30
|
def regist_handler(&handler_blk)
|
27
31
|
@handler_blk = handler_blk
|
28
32
|
end
|
33
|
+
|
34
|
+
def use_poltergeist(value)
|
35
|
+
@use_poltergeist = value
|
36
|
+
end
|
29
37
|
end
|
30
38
|
|
31
39
|
def execute(root_url)
|
32
40
|
if self.link_selectore
|
33
|
-
html = Maxwell::Converter.
|
41
|
+
html = Maxwell::Converter.call(root_url, use_poltergeist)
|
34
42
|
html.css(self.link_selectore).each do |a|
|
35
43
|
execute_for_result a[:href]
|
36
44
|
end
|
@@ -39,6 +47,10 @@ class Maxwell
|
|
39
47
|
end
|
40
48
|
end
|
41
49
|
|
50
|
+
def use_poltergeist
|
51
|
+
self.class.instance_eval("@use_poltergeist")
|
52
|
+
end
|
53
|
+
|
42
54
|
def link_selectore
|
43
55
|
self.class.instance_eval("@link_selectore")
|
44
56
|
end
|
@@ -57,7 +69,7 @@ class Maxwell
|
|
57
69
|
|
58
70
|
private
|
59
71
|
def execute_for_result(tip_url)
|
60
|
-
acquirer = acquirer_class.new(Maxwell::Converter.
|
72
|
+
acquirer = acquirer_class.new(Maxwell::Converter.call(tip_url, use_poltergeist))
|
61
73
|
acquirer.instance_eval &self.strategy_blk
|
62
74
|
|
63
75
|
acquirer.result.tap do |result|
|
data/maxwell.gemspec
CHANGED
@@ -21,6 +21,8 @@ Gem::Specification.new do |spec|
|
|
21
21
|
|
22
22
|
spec.add_dependency "nokogiri"
|
23
23
|
spec.add_dependency "httpclient"
|
24
|
+
spec.add_dependency 'poltergeist'
|
25
|
+
spec.add_dependency 'capybara'
|
24
26
|
|
25
27
|
spec.add_development_dependency "bundler", "~> 1.10"
|
26
28
|
spec.add_development_dependency "rake", "~> 10.0"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: maxwell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- gogotanaka
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-02-
|
11
|
+
date: 2016-02-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -38,6 +38,34 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: poltergeist
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: capybara
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
41
69
|
- !ruby/object:Gem::Dependency
|
42
70
|
name: bundler
|
43
71
|
requirement: !ruby/object:Gem::Requirement
|