maxwell 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +13 -15
- data/lib/maxwell/converter.rb +3 -1
- data/lib/maxwell/helper.rb +8 -0
- data/lib/maxwell/version.rb +1 -1
- data/lib/maxwell.rb +29 -53
- data/maxwell.gemspec +1 -0
- metadata +18 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3062e2f1c6b55eb5cc7318da36e44986f1d73fd2
|
4
|
+
data.tar.gz: d95d0c7e51a9d0dfe383bca8d09c22b3c26431e6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1bc5e93ee5211c5c9a4c1a5ed6390e6969bba4912677b7ad8a9e015b81b5f9106b7c5bf8302ef9d673a8995f8bbce4ca0d9379d73668d13cb22a6598bd6a91f6
|
7
|
+
data.tar.gz: 7f0e764c5e8a6b8ac3e43ec03dc2efc2b199a61cc7fc5cbc77534496e2a3d52e49e66f7dbc1725963982991ed230323b48def1b4e6769eb017d091a50cb567c5
|
data/README.md
CHANGED
@@ -4,16 +4,10 @@ Maxwell makes web scraping more simpler and faster with Ruby.
|
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
7
|
-
Add this line to your application's Gemfile:
|
8
|
-
|
9
7
|
```ruby
|
10
|
-
gem 'maxwell'
|
8
|
+
echo "gem 'maxwell'" >> Gemfile && bundle
|
11
9
|
```
|
12
10
|
|
13
|
-
And then execute:
|
14
|
-
|
15
|
-
$ bundle
|
16
|
-
|
17
11
|
Or install it yourself as:
|
18
12
|
|
19
13
|
$ gem install maxwell
|
@@ -22,21 +16,25 @@ Or install it yourself as:
|
|
22
16
|
|
23
17
|
```ruby
|
24
18
|
class YahooScraper < Maxwell::Base
|
25
|
-
|
19
|
+
attr_accessor :title, :url, :address
|
26
20
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
21
|
+
javascript true
|
22
|
+
|
23
|
+
concurrency 4
|
24
|
+
|
25
|
+
def parser html
|
26
|
+
@title = html.title
|
27
|
+
@url = html.css("td.sdhk jdj").text
|
28
|
+
@address = html.css("table tr.ddad").text
|
31
29
|
end
|
32
30
|
|
33
|
-
|
31
|
+
def handler result
|
34
32
|
p result
|
33
|
+
#=> { title: "...", url: "...", address: "..." }
|
35
34
|
end
|
36
|
-
|
37
35
|
end
|
38
36
|
|
39
|
-
YahooScraper.
|
37
|
+
YahooScraper.execute ["https://www.yahoo.com/"]
|
40
38
|
```
|
41
39
|
|
42
40
|
## Development
|
data/lib/maxwell/converter.rb
CHANGED
data/lib/maxwell/version.rb
CHANGED
data/lib/maxwell.rb
CHANGED
@@ -1,86 +1,62 @@
|
|
1
|
+
require 'parallel'
|
2
|
+
|
1
3
|
require "maxwell/converter"
|
4
|
+
require "maxwell/helper"
|
2
5
|
|
3
6
|
module Maxwell
|
7
|
+
class NoParserDefinedErr; end
|
8
|
+
|
4
9
|
class Base
|
5
10
|
class << self
|
6
|
-
def
|
7
|
-
|
8
|
-
|
9
|
-
|
11
|
+
def execute(urls)
|
12
|
+
Parallel.
|
13
|
+
map_with_index(urls, in_threads: @concurrency || 1) do |url, id|
|
14
|
+
p "scraping: #{ id + 1 }"
|
10
15
|
|
11
|
-
|
12
|
-
@
|
13
|
-
end
|
16
|
+
scraper = self.new
|
17
|
+
html = Maxwell::Converter.call(url, @use_poltergeist)
|
14
18
|
|
15
|
-
|
16
|
-
@html = nokogiri_obj
|
17
|
-
end
|
19
|
+
scraper.parser html
|
18
20
|
|
19
|
-
|
20
|
-
self.class.attr_scrapes.map { |k| [k, send(k)] }.to_h
|
21
|
+
scraper.handler ({ id: id + 1 }).merge(scraper.result)
|
21
22
|
end
|
22
|
-
end
|
23
23
|
end
|
24
24
|
|
25
|
-
def
|
26
|
-
@
|
27
|
-
@
|
25
|
+
def attr_accessor(*attrs)
|
26
|
+
@attrs ||= []
|
27
|
+
@attrs.concat attrs
|
28
|
+
super
|
28
29
|
end
|
29
30
|
|
30
|
-
def
|
31
|
-
@
|
31
|
+
def attrs
|
32
|
+
@attrs
|
32
33
|
end
|
33
34
|
|
34
|
-
def
|
35
|
+
def javascript(value)
|
35
36
|
@use_poltergeist = value
|
36
37
|
end
|
37
|
-
end
|
38
38
|
|
39
|
-
|
40
|
-
|
41
|
-
html = Maxwell::Converter.call(root_url, use_poltergeist)
|
42
|
-
html.css(self.link_selectore).each do |a|
|
43
|
-
execute_for_result a[:href]
|
44
|
-
end
|
45
|
-
else
|
46
|
-
execute_for_result root_url
|
39
|
+
def concurrency(value)
|
40
|
+
@concurrency = value
|
47
41
|
end
|
48
42
|
end
|
49
43
|
|
50
|
-
def
|
51
|
-
self
|
52
|
-
end
|
53
|
-
|
54
|
-
def link_selectore
|
55
|
-
self.class.instance_eval("@link_selectore")
|
56
|
-
end
|
57
|
-
|
58
|
-
def strategy_blk
|
59
|
-
self.class.instance_eval("@strategy_blk")
|
44
|
+
def parser html
|
45
|
+
raise NoParserDefinedErr "You need to define #{self}#parser"
|
60
46
|
end
|
61
47
|
|
62
|
-
def
|
63
|
-
|
48
|
+
def handler result
|
49
|
+
p result
|
64
50
|
end
|
65
51
|
|
66
|
-
def
|
67
|
-
self.class.
|
52
|
+
def result
|
53
|
+
self.class.attrs.map { |k| [k, self.send(k)] }.to_h
|
68
54
|
end
|
69
|
-
|
70
|
-
private
|
71
|
-
def execute_for_result(tip_url)
|
72
|
-
acquirer = acquirer_class.new(Maxwell::Converter.call(tip_url, use_poltergeist))
|
73
|
-
acquirer.instance_eval &self.strategy_blk
|
74
|
-
|
75
|
-
acquirer.result.tap do |result|
|
76
|
-
self.handler_blk.call(result) if self.handler_blk
|
77
|
-
end
|
78
|
-
end
|
79
55
|
end
|
80
56
|
end
|
81
57
|
|
82
58
|
class ::String
|
83
59
|
def trim
|
84
|
-
delete
|
60
|
+
delete "\r\n\t"
|
85
61
|
end
|
86
62
|
end
|
data/maxwell.gemspec
CHANGED
@@ -23,6 +23,7 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_dependency "httpclient"
|
24
24
|
spec.add_dependency 'poltergeist'
|
25
25
|
spec.add_dependency 'capybara'
|
26
|
+
spec.add_dependency 'parallel'
|
26
27
|
|
27
28
|
spec.add_development_dependency "bundler", "~> 1.10"
|
28
29
|
spec.add_development_dependency "rake", "~> 10.0"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: maxwell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- gogotanaka
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-03-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: parallel
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: bundler
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,6 +142,7 @@ files:
|
|
128
142
|
- exe/maxwell
|
129
143
|
- lib/maxwell.rb
|
130
144
|
- lib/maxwell/converter.rb
|
145
|
+
- lib/maxwell/helper.rb
|
131
146
|
- lib/maxwell/version.rb
|
132
147
|
- maxwell.gemspec
|
133
148
|
homepage: http://gogotanaka.me/
|
@@ -150,7 +165,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
150
165
|
version: '0'
|
151
166
|
requirements: []
|
152
167
|
rubyforge_project:
|
153
|
-
rubygems_version: 2.5.1
|
168
|
+
rubygems_version: 2.4.5.1
|
154
169
|
signing_key:
|
155
170
|
specification_version: 4
|
156
171
|
summary: Maxwell makes web scraping more simpler and faster with Ruby.
|