maxwell 0.4.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 099c077db029111f4ab30b731e75395c110b9a69
4
- data.tar.gz: 8ad88ceda71a2735edb3125737c404af5e34c973
3
+ metadata.gz: 344d408e0e787f44fb1a559ea51d5a7810f67101
4
+ data.tar.gz: 6c9f7f9167bc6cb3f981be5ceda9335370172d8b
5
5
  SHA512:
6
- metadata.gz: 69e93a319d297f2df1d14671737c129e9eb22aa901a4073b2f3130381ca672b92f49ddb65a213d8fef5d318be2159ee99dd2d3f4c7175ff04305015518bd713b
7
- data.tar.gz: 701875498b09eb445e0c5898492cff0a22000525ee0c62211a7535c78cc3e7179efcdada2982b2ee5b4e55e6189b46a9bc7cc6aa2a9323f0968b53e5364358f6
6
+ metadata.gz: 12e76e792218e7116a1e3d55056831ec369919d6eaf4fb25b6ba262eb8f8ea329d2f870e755921b40887267d4576c43a4060f0dbce1b4ff75e0c2f7104e44ba7
7
+ data.tar.gz: 3e2ba946c4e3625c3a2149a778331e05bcc4f1a7f92708d3b46ae04cad34a0a4be7b05acde2274e62daca861dd83a047828e62aba57c0bd1620def294c668400
data/Gemfile CHANGED
@@ -1,5 +1,3 @@
1
1
  source 'https://rubygems.org'
2
-
3
2
  gem "pry"
4
-
5
3
  gemspec
data/README.md CHANGED
@@ -15,38 +15,61 @@ Or install it yourself as:
15
15
  ## Usage
16
16
 
17
17
  ```ruby
18
- class YahooScraper < Maxwell::Base
19
- attr_accessor :title, :url, :address
18
+ class WikipediaScraper < Maxwell::Base
19
+ attr_accessor :title, :image_urls # attributes which you want to get
20
20
 
21
- javascript true
21
+ # You need to define 2 methods
22
+ # parser ... define how to parse attributes from html.
23
+ # handler ... define what to do with the result that comes from parser.
22
24
 
23
- concurrency 4
24
-
25
- def parser html
26
- @title = html.title
27
- @url = html.css("td.sdhk jdj").text
28
- @address = html.css("table tr.ddad").text
25
+ def parser(html) # html is a Nokogiri::HTML::Document object
26
+ @title = html.css('title').text # Ruby - Wikipedia
27
+ @image_urls = html.css('img').map { |img| img[:src] } # ["//upload.wikimedia.org/wikipedia/commons/thumb/8/80/Ruby_-_Winza%2C_Tanzania.jpg/240px-Ruby_-_Winza%2C_Tanzania.jpg", ...]
29
28
  end
30
29
 
31
- def handler result
30
+ def handler(result) # result is a Hash which contains the parsed attributes
32
31
  p result
33
- #=> { title: "...", url: "...", address: "..." }
34
32
  end
35
33
  end
36
34
 
37
- YahooScraper.execute ["https://www.yahoo.com/"]
35
+ WikipediaScraper.execute urls: %w[https://en.wikipedia.org/wiki/Ruby]
36
+
37
+ # output is
38
+ # {
39
+ # :url => "https://en.wikipedia.org/wiki/Ruby",
40
+ # :title => "Ruby - Wikipedia",
41
+ # :image_urls => [
42
+ # "//upload.wikimedia.org/wikipedia/commons/thumb/8/80/Ruby_-_Winza%2C_Tanzania.jpg/240px-Ruby_-_Winza%2C_Tanzania.jpg",
43
+ # "//upload.wikimedia.org/wikipedia/commons/thumb/f/f0/Corundum.png/220px-Corundum.png",
44
+ # "//upload.wikimedia.org/wikipedia/commons/thumb/b/b1/Ruby_transmittance.svg/220px-Ruby_transmittance.svg.png",
45
+ # "//upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Ruby_cristal.jpg/100px-Ruby_cristal.jpg",
46
+ # "//upload.wikimedia.org/wikipedia/commons/thumb/f/f0/Ruby_gem.JPG/160px-Ruby_gem.JPG",
47
+ # "//upload.wikimedia.org/wikipedia/commons/thumb/a/a3/Ruby_and_diamond_bracelet.jpg/160px-Ruby_and_diamond_bracelet.jpg",
48
+ # "//upload.wikimedia.org/wikipedia/commons/thumb/c/c4/Cut_Ruby.jpg/158px-Cut_Ruby.jpg",
49
+ # "//upload.wikimedia.org/wikipedia/commons/thumb/4/46/Artificial_ruby_hemisphere_under_a_normal_light.jpg/200px-Artificial_ruby_hemisphere_under_a_normal_light.jpg",
50
+ # "//upload.wikimedia.org/wikipedia/commons/thumb/3/38/Artificial_ruby_hemisphere_under_a_monochromatic_light.jpg/200px-Artificial_ruby_hemisphere_under_a_monochromatic_light.jpg",
51
+ # "//upload.wikimedia.org/wikipedia/commons/thumb/1/12/NMNH-Rubies-CroppedRotated.png/220px-NMNH-Rubies-CroppedRotated.png",
52
+ # "//upload.wikimedia.org/wikipedia/en/thumb/4/4a/Commons-logo.svg/30px-Commons-logo.svg.png",
53
+ # "//en.wikipedia.org/wiki/Special:CentralAutoLogin/start?type=1x1",
54
+ # "/static/images/wikimedia-button.png",
55
+ # "/static/images/poweredby_mediawiki_88x31.png"
56
+ # ]
57
+ # }
38
58
  ```
39
59
 
40
60
  ## Development
41
61
 
42
- After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment. Run `bundle exec maxwell` to use the gem in this directory, ignoring other installed copies of this gem.
62
+ `bin/setup` ... install dependencies.
43
63
 
44
- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
64
+ `rake test` ... run the tests.
45
65
 
46
- ## Contributing
66
+ `bin/console` ... interactive prompt that will allow you to experiment.
47
67
 
48
- Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/maxwell. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](contributor-covenant.org) code of conduct.
68
+ `bundle exec rake install` ... install this gem onto your local machine.
69
+
70
+ ## Contributing
49
71
 
72
+ Bug reports and pull requests are welcome on GitHub at https://github.com/gogotanaka/maxwell. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](https://www.contributor-covenant.org) code of conduct.
50
73
 
51
74
  ## License
52
75
 
@@ -1,3 +1,2 @@
1
1
  #!/usr/bin/env ruby
2
-
3
2
  require "maxwell"
@@ -10,18 +10,24 @@ module Maxwell
10
10
 
11
11
  class Base
12
12
  class << self
13
- def execute(urls)
14
- Parallel.
15
- map_with_index(urls, in_threads: @concurrency || 1) do |url, id|
16
- p "scraping: #{ id + 1 }"
13
+ def execute(args)
14
+ if !args[:urls].nil?
15
+ urls = args[:urls]
16
+ Parallel.
17
+ map_with_index(urls, in_threads: @concurrency || 1) do |url, id|
18
+ puts "\e[34m[#{id + 1}] scraping: #{ url }\e[0m"
17
19
 
18
- scraper = self.new
19
- html = Maxwell::Converter.call(url, @use_poltergeist)
20
+ scraper = self.new
21
+ html = Maxwell::Converter.call(url, @use_poltergeist)
20
22
 
21
- scraper.parser html
22
-
23
- scraper.handler ({ url: url }).merge(scraper.result)
24
- end
23
+ scraper.parser html
24
+ ({ url: url }).merge(scraper.result).tap do |result_hash|
25
+ scraper.handler result_hash
26
+ end
27
+ end
28
+ else
29
+ raise 'You need pass an argument urls: or raw_htmls:'
30
+ end
25
31
  end
26
32
 
27
33
  def attr_accessor(*attrs)
@@ -1,3 +1,3 @@
1
1
  module Maxwell
2
- VERSION = "0.4.3"
2
+ VERSION = "1.0.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: maxwell
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.3
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - gogotanaka
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-03-17 00:00:00.000000000 Z
11
+ date: 2017-01-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri