newrank 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/newrank.rb +139 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: eb1cbfb901aa3e8dc9117fc5df14b111596742ee
|
4
|
+
data.tar.gz: a95c42da710abe8ca57b132d9cc1365c1b393ef5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3971adcf2000608d38b9edd52179fe53637ab57ff6d9684569caf643a2324c65ec127c5afa07e1bc2aacee14af9a54a25c0c06f4b4ba1b886f9c2b6519552354
|
7
|
+
data.tar.gz: 1bcdf263496f182b74c3c47c843882547ebe64f9e49f39fca2b2fe18fafc5b2778049558bed8f8d3460949c8eaa96c67022e6c3030e7c80afe5e98b38b4e2e75
|
data/lib/newrank.rb
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'open-uri'
|
3
|
+
require 'rkelly'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'json'
|
6
|
+
require 'v8'
|
7
|
+
require 'execjs'
|
8
|
+
require 'rest-client'
|
9
|
+
|
10
|
+
# Scraper for the public account pages of newrank.cn.
#
# The only instance state is the memoized ExecJS context (@context) that is
# used to sign requests to the article endpoint.
class Newrank
  # Crawl the public statistics of one account.
  #
  # @param newrank_id [String] account id; non-breaking spaces (U+00A0) are
  #   removed before the id is used
  # @return [Hash] with the keys :active_users_count, :score, :introduce,
  #   :week_data and :posts_data; :posts_data is {} when no uuid was found
  def crawl(newrank_id)
    doc = document(newrank_id.gsub("\u{a0}", ""))

    # Fallback for a missing document.
    # NOTE(review): #document as written either returns a parsed document or
    # raises, so this branch is presumably only reached when #document is
    # overridden — confirm.
    if doc.nil?
      return {
        active_users_count: 0,
        score: 0,
        introduce: "",
        week_data: [],
        posts_data: {}
      }
    end

    score, uuid = score_and_uuid(doc)

    fans = doc.css(".detail-fans-counts")[0]
    # The counter is rendered with thousands separators ("12,345").
    active_users_count = fans.nil? ? 0 : fans.text.gsub(",", "").to_i

    intro = doc.css(".info-detail-head-weixin-fun-introduce")[0]
    introduce = intro.nil? ? "" : intro.text

    weekly = week_data(doc)
    # The article endpoint can only be queried when the page exposed a uuid.
    posts = uuid.nil? ? nil : fetch_post(uuid)

    {
      active_users_count: active_users_count,
      score: (score || 0),
      introduce: introduce,
      week_data: weekly,
      posts_data: (posts || {})
    }
  end

  # Fetch the article list of an account from the signed JSON endpoint.
  #
  # @param uuid [String] account uuid as returned by #score_and_uuid
  # @return [Object] the parsed JSON response body
  def fetch_post(uuid)
    nonce = gen_nonce
    payload = { uuid: uuid, nonce: nonce, xyz: gen_xyz(nonce, uuid) }
    headers = { "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36" }

    JSON.parse(RestClient.post("http://www.newrank.cn/xdnphb/detail/getAccountArticle", payload, headers))
  end

  # Extract the entries of the first array literal found in the page's first
  # <script> element.
  #
  # @param doc [#css] parsed detail page
  # @return [Array] one parsed JSON value per array element; [] when the page
  #   has no script, the script cannot be parsed, or it has no array literal
  def week_data(doc)
    script = doc.css("script")[0]
    return [] if script.nil?

    array_node = first_array_node(RKelly::Parser.new.parse(script.text.strip))
    return [] if array_node.nil?

    array_node.pointcut(RKelly::Nodes::ElementNode).matches.map do |element_node|
      JSON.parse(element_node.to_ecma)
    end
  end

  # Download and parse the detail page of an account.
  #
  # @param newrank_account [String] account id; it is form-encoded before it
  #   is appended to the query string
  # @return [Object] whatever Nokogiri::HTML returns for the response body
  # @raise whatever open-uri raises for HTTP or network failures (not rescued)
  def document(newrank_account)
    url = 'http://www.newrank.cn/public/info/detail.html?account=' + URI.encode_www_form_component(newrank_account)
    options = { "User-Agent" => "Mozilla/5.0 (Windows NT 6.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1", :read_timeout => 10 }
    # Block form so the response IO is closed as soon as it has been parsed.
    parse = lambda { |io| Nokogiri::HTML(io, nil, 'utf-8') }

    if URI.respond_to?(:open)
      # Kernel#open no longer opens URLs on Ruby 3.0+; URI.open is the
      # supported open-uri entry point.
      URI.open(url, options, &parse)
    else
      # Rubies whose open-uri does not define URI.open yet.
      open(url, options, &parse)
    end
  end

  # Read the score and the account uuid out of the page's first <script>.
  #
  # @param doc [#css] parsed detail page
  # @return [Array] [score, uuid]: [nil, nil] when the page has no script;
  #   otherwise score is a Float (0.0 when "new_rank_index_mark" is absent)
  #   and uuid is a String, or nil when the script does not contain one
  def score_and_uuid(doc)
    script = doc.css("script")[0]
    return [nil, nil] if script.nil?

    ast = RKelly::Parser.new.parse(script.text.strip)
    # NOTE(review): some RKelly versions appear to return nil instead of
    # raising on a syntax error — treated here as "nothing found"; confirm.
    return [0.0, nil] if ast.nil?

    [extract_score(ast), extract_uuid(ast)]
  end

  # Sleep for a random duration below one second, to space out requests.
  # Nothing in this class calls it; it is kept for external callers.
  def wait_for_seconds
    sleep(rand)
  end

  # Generate the `nonce` request parameter.
  #
  # @return [String] nine random lowercase hexadecimal digits
  def gen_nonce
    Array.new(9) { rand(16).to_s(16) }.join
  end

  # Generate the `xyz` request parameter: the site's md5 variant applied to
  # the request path, app key, uuid and nonce.
  #
  # @param nonce [String] value from #gen_nonce
  # @param uuid [String] account uuid
  # @return [Object] whatever the JavaScript `newrank_md5` function returns
  def gen_xyz(nonce, uuid)
    _md5("/xdnphb/detail/getAccountArticle?AppKey=joker&uuid=#{uuid}&nonce=#{nonce}")
  end

  # Hash a string with the JavaScript md5 implementation written by newrank
  # (file: assets/newrank_md5.js).
  #
  # NOTE(review): `bare: true` is forwarded unchanged; whether ExecJS treats
  # it as an option or as an extra JavaScript argument is not visible here —
  # confirm against the ExecJS version in use.
  def _md5(str)
    js_context.call('newrank_md5', str, bare: true)
  end

  # Memoized ExecJS context compiled from assets/newrank_md5.js, which is
  # looked up relative to this file. The file is only read on first use.
  def js_context
    @context ||= ExecJS.compile(File.read(File.join(File.dirname(__FILE__), './assets/newrank_md5.js')))
  end

  private

  # First array literal in the script AST, or nil when there is none.
  def first_array_node(ast)
    return nil if ast.nil?

    ast.pointcut(RKelly::Nodes::ArrayNode).matches.first
  end

  # "new_rank_index_mark" of the first array element as a Float, else 0.0.
  def extract_score(ast)
    array_node = first_array_node(ast)
    element_node = array_node.nil? ? nil : array_node.pointcut(RKelly::Nodes::ElementNode).matches.first
    json_data = element_node.nil? ? {} : JSON.parse(element_node.to_ecma)

    mark = json_data["new_rank_index_mark"]
    mark ? mark.to_f : 0.0
  end

  # Value of the "uuid" property inside the last assignment expression of
  # the script, without its surrounding quotes; nil when it is not present.
  def extract_uuid(ast)
    object_node = ast.pointcut(RKelly::Nodes::AssignExprNode).matches[-1]
    return nil if object_node.nil?

    # Property names are compared with their quotes, as the original code did.
    property = object_node.pointcut(RKelly::Nodes::PropertyNode).matches.find { |n| n.name == '"uuid"' }
    return nil if property.nil?

    property.value.value[1..-2]
  end
end
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: newrank
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Tesla Lee
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-10-25 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A Crawler for NewRank
|
14
|
+
email: leechee89@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/newrank.rb
|
20
|
+
homepage: https://github.com/liqites/newrank_crawler
|
21
|
+
licenses:
|
22
|
+
- MIT
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">="
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.4.5
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Newrank Crawler
|
44
|
+
test_files: []
|