maxwell 0.4.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +0 -2
- data/README.md +39 -16
- data/exe/maxwell +0 -1
- data/lib/maxwell.rb +16 -10
- data/lib/maxwell/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 344d408e0e787f44fb1a559ea51d5a7810f67101
|
4
|
+
data.tar.gz: 6c9f7f9167bc6cb3f981be5ceda9335370172d8b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 12e76e792218e7116a1e3d55056831ec369919d6eaf4fb25b6ba262eb8f8ea329d2f870e755921b40887267d4576c43a4060f0dbce1b4ff75e0c2f7104e44ba7
|
7
|
+
data.tar.gz: 3e2ba946c4e3625c3a2149a778331e05bcc4f1a7f92708d3b46ae04cad34a0a4be7b05acde2274e62daca861dd83a047828e62aba57c0bd1620def294c668400
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -15,38 +15,61 @@ Or install it yourself as:
|
|
15
15
|
## Usage
|
16
16
|
|
17
17
|
```ruby
|
18
|
-
class
|
19
|
-
attr_accessor :title, :
|
18
|
+
class WikipediaScraper < Maxwell::Base
|
19
|
+
attr_accessor :title, :image_urls # attributes which you want to get
|
20
20
|
|
21
|
-
|
21
|
+
# You need to define 2 methods
|
22
|
+
# parser ... define how to parse attributes from html.
|
23
|
+
# handler ... define what to do with the result that comes from the parser.
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
@title = html.title
|
27
|
-
@url = html.css("td.sdhk jdj").text
|
28
|
-
@address = html.css("table tr.ddad").text
|
25
|
+
def parser(html) # html is Nokogiri::HTML::Document object
|
26
|
+
@title = html.css('title').text # Ruby - Wikipedia
|
27
|
+
@image_urls = html.css('img').map { |img| img[:src] } # ["//upload.wikimedia.org/wikipedia/commons/thumb/8/80/Ruby_-_Winza%2C_Tanzania.jpg/240px-Ruby_-_Winza%2C_Tanzania.jpg", ...]
|
29
28
|
end
|
30
29
|
|
31
|
-
def handler result
|
30
|
+
def handler(result) # result is a Hash which contains the parsed attributes
|
32
31
|
p result
|
33
|
-
#=> { title: "...", url: "...", address: "..." }
|
34
32
|
end
|
35
33
|
end
|
36
34
|
|
37
|
-
|
35
|
+
WikipediaScraper.execute urls: %w[https://en.wikipedia.org/wiki/Ruby]
|
36
|
+
|
37
|
+
# output is
|
38
|
+
# {
|
39
|
+
# :url => "https://en.wikipedia.org/wiki/Ruby",
|
40
|
+
# :title => "Ruby - Wikipedia",
|
41
|
+
# :image_urls => [
|
42
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/8/80/Ruby_-_Winza%2C_Tanzania.jpg/240px-Ruby_-_Winza%2C_Tanzania.jpg",
|
43
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/f/f0/Corundum.png/220px-Corundum.png",
|
44
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/b/b1/Ruby_transmittance.svg/220px-Ruby_transmittance.svg.png",
|
45
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Ruby_cristal.jpg/100px-Ruby_cristal.jpg",
|
46
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/f/f0/Ruby_gem.JPG/160px-Ruby_gem.JPG",
|
47
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/a/a3/Ruby_and_diamond_bracelet.jpg/160px-Ruby_and_diamond_bracelet.jpg",
|
48
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/c/c4/Cut_Ruby.jpg/158px-Cut_Ruby.jpg",
|
49
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/4/46/Artificial_ruby_hemisphere_under_a_normal_light.jpg/200px-Artificial_ruby_hemisphere_under_a_normal_light.jpg",
|
50
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/3/38/Artificial_ruby_hemisphere_under_a_monochromatic_light.jpg/200px-Artificial_ruby_hemisphere_under_a_monochromatic_light.jpg",
|
51
|
+
# "//upload.wikimedia.org/wikipedia/commons/thumb/1/12/NMNH-Rubies-CroppedRotated.png/220px-NMNH-Rubies-CroppedRotated.png",
|
52
|
+
# "//upload.wikimedia.org/wikipedia/en/thumb/4/4a/Commons-logo.svg/30px-Commons-logo.svg.png",
|
53
|
+
# "//en.wikipedia.org/wiki/Special:CentralAutoLogin/start?type=1x1",
|
54
|
+
# "/static/images/wikimedia-button.png",
|
55
|
+
# "/static/images/poweredby_mediawiki_88x31.png"
|
56
|
+
# ]
|
57
|
+
# }
|
38
58
|
```
|
39
59
|
|
40
60
|
## Development
|
41
61
|
|
42
|
-
|
62
|
+
`bin/setup` ... install dependencies.
|
43
63
|
|
44
|
-
|
64
|
+
`rake test` ... run the tests.
|
45
65
|
|
46
|
-
|
66
|
+
`bin/console` ... interactive prompt that will allow you to experiment.
|
47
67
|
|
48
|
-
|
68
|
+
`bundle exec rake install` ... install this gem onto your local machine.
|
69
|
+
|
70
|
+
## Contributing
|
49
71
|
|
72
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/gogotanaka/maxwell. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
50
73
|
|
51
74
|
## License
|
52
75
|
|
data/exe/maxwell
CHANGED
data/lib/maxwell.rb
CHANGED
@@ -10,18 +10,24 @@ module Maxwell
|
|
10
10
|
|
11
11
|
class Base
|
12
12
|
class << self
|
13
|
-
def execute(
|
14
|
-
|
15
|
-
|
16
|
-
|
13
|
+
def execute(args)
|
14
|
+
if !args[:urls].nil?
|
15
|
+
urls = args[:urls]
|
16
|
+
Parallel.
|
17
|
+
map_with_index(urls, in_threads: @concurrency || 1) do |url, id|
|
18
|
+
puts "\e[34m[#{id + 1}] scraping: #{ url }\e[0m"
|
17
19
|
|
18
|
-
|
19
|
-
|
20
|
+
scraper = self.new
|
21
|
+
html = Maxwell::Converter.call(url, @use_poltergeist)
|
20
22
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
23
|
+
scraper.parser html
|
24
|
+
({ url: url }).merge(scraper.result).tap do |result_hash|
|
25
|
+
scraper.handler result_hash
|
26
|
+
end
|
27
|
+
end
|
28
|
+
else
|
29
|
+
raise 'You need pass an argument urls: or raw_htmls:'
|
30
|
+
end
|
25
31
|
end
|
26
32
|
|
27
33
|
def attr_accessor(*attrs)
|
data/lib/maxwell/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: maxwell
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- gogotanaka
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-01-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|