wcrawler 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.md +20 -0
- data/README.md +5 -0
- data/bin/wcrawler +6 -0
- data/lib/version.rb +3 -0
- data/lib/wcrawler/wei_disk_crawler.rb +87 -0
- data/lib/wei_disk_crawler_cli.rb +30 -0
- data/wcrawler.gemspec +20 -0
- metadata +65 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 3edfb127df16c0d081013257c89526ddf3c80b91
|
4
|
+
data.tar.gz: 972479202985b4f9fea112ea9ebb881a15cfa6e1
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 16df66e728189400c2488238913fb8026957eac78fcd96e7e5cf195978d3108af6aa10ff40e459ee206860608577ce204fc6eb549359e33f60f7f80b3cdb33e4
|
7
|
+
data.tar.gz: 099de1e9c2e6362cab78212a13759832e177f560c047b18d99318e2746fd6bfa83a6a6de0f1f5dcdb83dd1b22b0c36599bd392bb173827804cca86757f9e88bd
|
data/LICENSE.md
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010 J.K et al.
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
data/bin/wcrawler
ADDED
data/lib/version.rb
ADDED
data/lib/wcrawler/wei_disk_crawler.rb
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'typhoeus'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
# Crawls the file listing pages of one vdisk.weibo.com user and collects, for
# every resource linked from those pages, the resource name and the first
# entry of the "download_list" returned by the site's stat-count endpoint.
class WeiDiskCrawler

  # Base URL of a user's listing; the user id is appended to it.
  WEI_DISK_PREFIX = "http://vdisk.weibo.com/u/"

  # @param params [Hash] options; +:uid+ is mandatory
  # @raise [RuntimeError] "Missing User ID" when +:uid+ is absent or nil
  def initialize(params = {})
    raise "Missing User ID" if params[:uid].nil?
    @uid = params[:uid]
    @resources = {}
    # 0 means "not fetched yet"; see #max_page_number.
    @page_total = 0
  end

  # Number of listing pages for the user. The value is fetched once and then
  # served from @page_total.
  #
  # @return [Integer] always >= 1
  def max_page_number
    return @page_total unless @page_total.zero?

    request = Typhoeus::Request.new("#{WEI_DISK_PREFIX}#{@uid}")
    request.run
    listing = Nokogiri::HTML(request.response.response_body)

    # No pager element on the page: the listing is a single page.
    return @page_total = 1 if listing.css('.vd_page_main .vd_page').empty?

    parsed_total = listing.css('.vd_page a:nth-last-child(2)').text.to_i
    # String#to_i yields 0 for non-numeric text. Storing 0 would both skip
    # every page in #list_all_resources and defeat the memoisation above, so
    # fall back to the one page that was just fetched.
    @page_total = [parsed_total, 1].max
  end

  # Walks every listing page and accumulates the resources found on each.
  #
  # @return [Hash] resource name => first "download_list" entry
  def list_all_resources
    (1..max_page_number).each do |page_number|
      @resources = @resources.merge(list_resources_on_page(page_number))
    end
    @resources
  end

  # Fetches one listing page, then queries the stat-count endpoint for every
  # resource linked from it and records the answers in @resources.
  #
  # The page is fetched synchronously first and the per-resource requests are
  # run in a single Hydra afterwards, so Hydra#run is never re-entered from
  # inside one of its own callbacks.
  #
  # @param page_number [Integer, String] page to fetch (interpolated into the URL)
  # @return [Hash] all resources collected so far (not only this page's)
  def list_resources_on_page(page_number)
    page_request = Typhoeus::Request.new("#{WEI_DISK_PREFIX}#{@uid}?page=#{page_number}")
    page_request.run
    page_doc = Nokogiri::HTML(page_request.response.response_body)

    hydra = Typhoeus::Hydra.new
    resource_ids_in(page_doc).each do |resource_id|
      request = stat_request(resource_id)
      request.on_complete { |ajax_response| record_resource(ajax_response) }
      hydra.queue request
    end
    hydra.run

    p "#"*50
    p "Page #{page_number} has been parsed successfully."
    p "#"*50
    @resources
  end

  # Appends the textual form of all resources to downloads/<file_name>,
  # creating the downloads directory when it does not exist yet.
  #
  # @param file_name [String] file name inside the downloads directory
  # @return [Integer] whatever File#write reports for the appended data
  def write_resources_to_file(file_name)
    Dir.mkdir("downloads") unless Dir.exist?("downloads")
    File.open("downloads/#{file_name}", 'a') { |file| file.write(list_all_resources) }
  end

  private

  # Extracts the resource ids (last path segment of each link) from a listing page.
  def resource_ids_in(page_doc)
    hrefs = page_doc.css('td.sort_name_m div.sort_name_pic a').map { |link| link.attr('href') }
    # Anchors without an href carry no id; skip them instead of crashing on nil.
    hrefs.compact.map { |href| href.split('/').last }
  end

  # Builds the stat-count request for one resource id.
  def stat_request(resource_id)
    # Milliseconds since the epoch, computed without the 'date' library.
    now_timestamp = (Time.now.to_f * 1000).to_i
    Typhoeus::Request.new(
      "http://vdisk.weibo.com/api/weipan/fileopsStatCount?link=#{resource_id}&ops=download&_=#{now_timestamp}",
      :method => :get,
      headers: {
        :Accept => "application/json, text/javascript, */*; q=0.01",
        :Referer => "http://vdisk.weibo.com/s/#{resource_id}",
        :'X-Requested-With' => "XMLHttpRequest",
        :'Connection' => "keep-alive",
        :'x-response-version' => "2"
      }
    )
  end

  # Parses one stat-count response (once) and stores its name and first
  # download entry. A body that is not a JSON object is reported on stderr
  # and skipped so that a single bad resource does not abort the crawl.
  def record_resource(ajax_response)
    json = JSON.parse(ajax_response.response_body)
    unless json.is_a?(Hash)
      warn "Skipping resource: response is not a JSON object"
      return
    end

    p "#{json['name']} has been parsed successfully."
    # Array() turns a missing "download_list" into [] so .first yields nil.
    @resources[json["name"]] = Array(json["download_list"]).first
  rescue JSON::ParserError, TypeError => e
    warn "Skipping resource: #{e.message}"
  end
end
|
86
|
+
|
87
|
+
|
data/lib/wei_disk_crawler_cli.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'wcrawler/wei_disk_crawler'
|
2
|
+
require 'thor'
|
3
|
+
|
4
|
+
# Thor command-line front end for WeiDiskCrawler. Every command takes the
# mandatory --uid option identifying the user whose listing is crawled.
class WeiDiskCrawlerCLI < Thor
  desc "crawl UID", "crawl resources from UID"
  option :uid, required: true
  # Crawls every page; the result itself is not printed by this command.
  def crawl
    build_crawler.list_all_resources
  end

  desc "list <page-number>", "list resources from all pages / specific page number"
  option :uid, required: true
  # Prints the resources of one page when a page number is given, otherwise
  # the resources of all pages.
  def list(page_number=nil)
    crawler = build_crawler
    resources = page_number ? crawler.list_resources_on_page(page_number) : crawler.list_all_resources
    p resources
  end

  desc "max_page_num", "list max page number"
  option :uid, required: true
  # Prints how many listing pages the user has.
  def max_page_num
    total = build_crawler.max_page_number
    p "There's #{total} pages in #{options[:uid]}'s resource"
  end

  private

  # Crawler bound to the --uid given on the command line.
  def build_crawler
    WeiDiskCrawler.new(uid: options[:uid])
  end
end
|
30
|
+
|
data/wcrawler.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# Make lib/ loadable so the version constant can be read while building the gem.
lib = File.expand_path("../lib/", __FILE__)
$LOAD_PATH.unshift lib unless $LOAD_PATH.include?(lib)
require "version"

Gem::Specification.new do |spec|
  # Libraries required by lib/wcrawler/wei_disk_crawler.rb and
  # lib/wei_disk_crawler_cli.rb; without these declarations a fresh
  # `gem install wcrawler` cannot load its own code.
  spec.add_runtime_dependency "nokogiri"
  spec.add_runtime_dependency "typhoeus"
  spec.add_runtime_dependency "thor"
  spec.add_development_dependency "bundler", "~> 1.0"
  spec.authors = ["J.K"]
  spec.summary = "Downloader Gem No.1"
  spec.description = "This is a tool to provide a easy way to download the resources from weipan"
  spec.email = "jiukunz@gmail.com"
  spec.executables = %w[wcrawler]
  spec.files = %w[wcrawler.gemspec] + Dir['*.md', 'bin/*', 'lib/**/*.rb']
  spec.licenses = %w[MIT]
  spec.name = "wcrawler"
  spec.homepage = "http://jiukunz.github.io/"
  spec.require_paths = %w[lib]
  spec.required_rubygems_version = ">= 1.3.5"
  spec.version = WeiDiskCrawler::VERSION
end
|
metadata
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wcrawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- J.K
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-02-04 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.0'
|
27
|
+
description: This is a tool to provide a easy way to download the resources from weipan
|
28
|
+
email: jiukunz@gmail.com
|
29
|
+
executables:
|
30
|
+
- wcrawler
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- LICENSE.md
|
35
|
+
- README.md
|
36
|
+
- bin/wcrawler
|
37
|
+
- lib/version.rb
|
38
|
+
- lib/wcrawler/wei_disk_crawler.rb
|
39
|
+
- lib/wei_disk_crawler_cli.rb
|
40
|
+
- wcrawler.gemspec
|
41
|
+
homepage: http://jiukunz.github.io/
|
42
|
+
licenses:
|
43
|
+
- MIT
|
44
|
+
metadata: {}
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
require_paths:
|
48
|
+
- lib
|
49
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: 1.3.5
|
59
|
+
requirements: []
|
60
|
+
rubyforge_project:
|
61
|
+
rubygems_version: 2.4.5
|
62
|
+
signing_key:
|
63
|
+
specification_version: 4
|
64
|
+
summary: Downloader Gem No.1
|
65
|
+
test_files: []
|