maxwell 0.4.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +0 -2
- data/README.md +39 -16
- data/exe/maxwell +0 -1
- data/lib/maxwell.rb +16 -10
- data/lib/maxwell/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 344d408e0e787f44fb1a559ea51d5a7810f67101
|
4
|
+
data.tar.gz: 6c9f7f9167bc6cb3f981be5ceda9335370172d8b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 12e76e792218e7116a1e3d55056831ec369919d6eaf4fb25b6ba262eb8f8ea329d2f870e755921b40887267d4576c43a4060f0dbce1b4ff75e0c2f7104e44ba7
|
7
|
+
data.tar.gz: 3e2ba946c4e3625c3a2149a778331e05bcc4f1a7f92708d3b46ae04cad34a0a4be7b05acde2274e62daca861dd83a047828e62aba57c0bd1620def294c668400
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -15,38 +15,61 @@ Or install it yourself as:
|
|
15
15
|
## Usage
|
16
16
|
|
17
17
|
```ruby
|
18
|
-
class
|
19
|
-
attr_accessor :title, :
|
18
|
+
class WikipediaScraper < Maxwell::Base
|
19
|
+
attr_accessor :title, :image_urls # attributes which you want to get
|
20
20
|
|
21
|
-
|
21
|
+
# You need to define 2 methods
|
22
|
+
# parser ... define how to parse attributes from html.
|
23
|
+
# handler ... define what to do with the result that comes from the parser.
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
@title = html.title
|
27
|
-
@url = html.css("td.sdhk jdj").text
|
28
|
-
@address = html.css("table tr.ddad").text
|
25
|
+
def parser(html) # html is a Nokogiri::HTML::Document object
|
26
|
+
@title = html.css('title').text # Ruby - Wikipedia
|
27
|
+
@image_urls = html.css('img').map { |img| img[:src] } # ["//upload.wikimedia.org/wikipedia/commons/thumb/8/80/Ruby_-_Winza%2C_Tanzania.jpg/240px-Ruby_-_Winza%2C_Tanzania.jpg", ...]
|
29
28
|
end
|
30
29
|
|
31
|
-
def handler result
|
30
|
+
def handler(result) # result is a Hash which contains the parsed attributes
|
32
31
|
p result
|
33
|
-
#=> { title: "...", url: "...", address: "..." }
|
34
32
|
end
|
35
33
|
end
|
36
34
|
|
37
|
-
|
35
|
+
WikipediaScraper.execute urls: %w[https://en.wikipedia.org/wiki/Ruby]
|
36
|
+
|
37
|
+
# output is
|
38
|
+
# {
|
39
|
+
# :url => "https://en.wikipedia.org/wiki/Ruby",
|
40
|
+
# :title => "Ruby - Wikipedia",
|
41
|
+
# :image_urls => [
|
42
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/8/80/Ruby_-_Winza%2C_Tanzania.jpg/240px-Ruby_-_Winza%2C_Tanzania.jpg",
|
43
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/f/f0/Corundum.png/220px-Corundum.png",
|
44
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/b/b1/Ruby_transmittance.svg/220px-Ruby_transmittance.svg.png",
|
45
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Ruby_cristal.jpg/100px-Ruby_cristal.jpg",
|
46
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/f/f0/Ruby_gem.JPG/160px-Ruby_gem.JPG",
|
47
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/a/a3/Ruby_and_diamond_bracelet.jpg/160px-Ruby_and_diamond_bracelet.jpg",
|
48
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/c/c4/Cut_Ruby.jpg/158px-Cut_Ruby.jpg",
|
49
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/4/46/Artificial_ruby_hemisphere_under_a_normal_light.jpg/200px-Artificial_ruby_hemisphere_under_a_normal_light.jpg",
|
50
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/3/38/Artificial_ruby_hemisphere_under_a_monochromatic_light.jpg/200px-Artificial_ruby_hemisphere_under_a_monochromatic_light.jpg",
|
51
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/1/12/NMNH-Rubies-CroppedRotated.png/220px-NMNH-Rubies-CroppedRotated.png",
|
52
|
+
# "//upload.wikimedia.org/wikipedia/en/thumb/4/4a/Commons-logo.svg/30px-Commons-logo.svg.png",
|
53
|
+
# "//en.wikipedia.org/wiki/Special:CentralAutoLogin/start?type=1x1",
|
54
|
+
# "/static/images/wikimedia-button.png",
|
55
|
+
# "/static/images/poweredby_mediawiki_88x31.png"
|
56
|
+
# ]
|
57
|
+
# }
|
38
58
|
```
|
39
59
|
|
40
60
|
## Development
|
41
61
|
|
42
|
-
|
62
|
+
`bin/setup` ... install dependencies.
|
43
63
|
|
44
|
-
|
64
|
+
`rake test` ... run the tests.
|
45
65
|
|
46
|
-
|
66
|
+
`bin/console` ... interactive prompt that will allow you to experiment.
|
47
67
|
|
48
|
-
|
68
|
+
`bundle exec rake install` ... install this gem onto your local machine.
|
69
|
+
|
70
|
+
## Contributing
|
49
71
|
|
72
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/gogotanaka/maxwell. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
50
73
|
|
51
74
|
## License
|
52
75
|
|
data/exe/maxwell
CHANGED
data/lib/maxwell.rb
CHANGED
@@ -10,18 +10,24 @@ module Maxwell
|
|
10
10
|
|
11
11
|
class Base
|
12
12
|
class << self
|
13
|
-
def execute(
|
14
|
-
|
15
|
-
|
16
|
-
|
13
|
+
def execute(args)
|
14
|
+
if !args[:urls].nil?
|
15
|
+
urls = args[:urls]
|
16
|
+
Parallel.
|
17
|
+
map_with_index(urls, in_threads: @concurrency || 1) do |url, id|
|
18
|
+
puts "\e[34m[#{id + 1}] scraping: #{ url }\e[0m"
|
17
19
|
|
18
|
-
|
19
|
-
|
20
|
+
scraper = self.new
|
21
|
+
html = Maxwell::Converter.call(url, @use_poltergeist)
|
20
22
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
23
|
+
scraper.parser html
|
24
|
+
({ url: url }).merge(scraper.result).tap do |result_hash|
|
25
|
+
scraper.handler result_hash
|
26
|
+
end
|
27
|
+
end
|
28
|
+
else
|
29
|
+
raise 'You need pass an argument urls: or raw_htmls:'
|
30
|
+
end
|
25
31
|
end
|
26
32
|
|
27
33
|
def attr_accessor(*attrs)
|
data/lib/maxwell/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: maxwell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- gogotanaka
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-01-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|