newrank 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/newrank.rb +139 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: eb1cbfb901aa3e8dc9117fc5df14b111596742ee
|
4
|
+
data.tar.gz: a95c42da710abe8ca57b132d9cc1365c1b393ef5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3971adcf2000608d38b9edd52179fe53637ab57ff6d9684569caf643a2324c65ec127c5afa07e1bc2aacee14af9a54a25c0c06f4b4ba1b886f9c2b6519552354
|
7
|
+
data.tar.gz: 1bcdf263496f182b74c3c47c843882547ebe64f9e49f39fca2b2fe18fafc5b2778049558bed8f8d3460949c8eaa96c67022e6c3030e7c80afe5e98b38b4e2e75
|
data/lib/newrank.rb
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'open-uri'
|
3
|
+
require 'rkelly'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'json'
|
6
|
+
require 'v8'
|
7
|
+
require 'execjs'
|
8
|
+
require 'rest-client'
|
9
|
+
|
10
|
+
# Crawler for public account pages on www.newrank.cn.
#
# The entry point is #crawl, which downloads an account's detail page and
# returns a Hash with the follower count, score, introduction text, weekly
# data and the account's posts.
class Newrank
  # Detail page URL; the (escaped) account id is appended as the query value.
  DETAIL_URL = 'http://www.newrank.cn/public/info/detail.html?account='.freeze

  # Endpoint queried by #fetch_post.
  ARTICLE_URL = "http://www.newrank.cn/xdnphb/detail/getAccountArticle".freeze

  # User-Agent header sent when downloading the detail page.
  DOCUMENT_USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1".freeze

  # User-Agent header sent when posting to ARTICLE_URL.
  ARTICLE_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36".freeze

  # Alphabet and length used by #gen_nonce.
  HEX_DIGITS = %w[0 1 2 3 4 5 6 7 8 9 a b c d e f].freeze
  NONCE_LENGTH = 9

  # Crawl the newrank info of one account.
  #
  # @param newrank_id [String] account id; non-breaking spaces (U+00A0) are
  #   removed before it is used
  # @return [Hash] with the keys :active_users_count (Integer), :score,
  #   :introduce (String), :week_data (Array) and :posts_data (parsed JSON,
  #   or {} when no uuid could be found)
  def crawl(newrank_id)
    doc = document(newrank_id.gsub("\u{a0}", ""))
    return empty_result if doc.nil?

    score, uuid = score_and_uuid(doc)
    fans_element = doc.css(".detail-fans-counts")[0]
    introduce_element = doc.css(".info-detail-head-weixin-fun-introduce")[0]
    weekly = week_data(doc)
    # Posts can only be requested once the page has revealed the account uuid.
    posts = uuid.nil? ? nil : fetch_post(uuid)

    {
      # The counter is rendered with thousands separators, e.g. "12,345".
      active_users_count: fans_element.nil? ? 0 : fans_element.text.gsub(",", "").to_i,
      score: (score || 0),
      introduce: introduce_element.nil? ? "" : introduce_element.text,
      week_data: weekly,
      posts_data: (posts || {})
    }
  end

  # Crawl the posts of an account.
  #
  # @param uuid [String] account uuid as returned by #score_and_uuid
  # @return [Object] the parsed JSON body of the response
  def fetch_post(uuid)
    nonce = gen_nonce
    xyz = gen_xyz(nonce, uuid)
    response = RestClient.post(
      ARTICLE_URL,
      { uuid: uuid, nonce: nonce, xyz: xyz },
      { "User-Agent" => ARTICLE_USER_AGENT }
    )
    JSON.parse(response)
  end

  # Crawl the week data embedded in the first <script> of the page.
  #
  # Every element of the first array literal found in that script is
  # serialised back to source text and parsed as JSON.
  #
  # @param doc [#css] parsed detail page
  # @return [Array] parsed elements; empty when the page has no script or the
  #   script contains no array literal
  def week_data(doc)
    script = doc.css("script")[0]
    return [] if script.nil?

    array_node = first_array_node(RKelly::Parser.new.parse(script.text.strip))
    return [] if array_node.nil?

    array_node.pointcut(RKelly::Nodes::ElementNode).matches.map do |element_node|
      JSON.parse(element_node.to_ecma)
    end
  end

  # Download the detail page and parse it into a Nokogiri document.
  #
  # @param newrank_account [String] account id; it is form-escaped before
  #   being placed in the query string
  # @return [Nokogiri::HTML::Document]
  def document(newrank_account)
    url = DETAIL_URL + URI.encode_www_form_component(newrank_account)
    # URI#open (provided by open-uri) is used instead of Kernel#open, which no
    # longer opens URLs on Ruby 3.0+. The block form closes the IO once
    # Nokogiri has read it.
    URI.parse(url).open("User-Agent" => DOCUMENT_USER_AGENT, :read_timeout => 10) do |io|
      Nokogiri::HTML(io, nil, 'utf-8')
    end
  end

  # Find the score and the uuid in the first <script> of the page.
  #
  # The score is read from the "new_rank_index_mark" key of the first element
  # of the first array literal; the uuid from the "uuid" property of the last
  # assignment expression.
  #
  # @param doc [#css] parsed detail page
  # @return [Array] [score, uuid]; [nil, nil] when the page has no script,
  #   score is 0.0 when the key is missing, uuid is nil when it is not found
  def score_and_uuid(doc)
    script = doc.css("script")[0]
    return [nil, nil] if script.nil?

    ast = RKelly::Parser.new.parse(script.text.strip)
    # NOTE(review): guards against a nil AST in case the parser returns nil
    # for input it cannot parse - confirm against the rkelly version in use.
    return [nil, nil] if ast.nil?

    array_node = first_array_node(ast)
    element_node = array_node && array_node.pointcut(RKelly::Nodes::ElementNode).matches.first
    json_data = element_node.nil? ? {} : JSON.parse(element_node.to_ecma)
    mark = json_data["new_rank_index_mark"]

    [mark ? mark.to_f : 0.0, extract_uuid(ast)]
  end

  # Sleep for a random duration below one second, to avoid sending requests
  # too quickly. Not called by any other method of this class.
  def wait_for_seconds
    sleep(rand)
  end

  # Generate the request parameter nonce.
  #
  # @return [String] NONCE_LENGTH random lowercase hexadecimal characters
  def gen_nonce
    Array.new(NONCE_LENGTH) { HEX_DIGITS[rand(HEX_DIGITS.length)] }.join
  end

  # Generate the request parameter xyz, a digest over the request path and
  # its parameters.
  #
  # @param nonce [String] value produced by #gen_nonce
  # @param uuid [String] account uuid
  # @return [Object] whatever the JS function newrank_md5 returns
  def gen_xyz(nonce, uuid)
    h = "/xdnphb/detail/getAccountArticle?AppKey=joker&uuid=#{uuid}&nonce=#{nonce}"
    _md5(h)
  end

  # Use the JS md5 algorithm written by newrank (file assets/newrank_md5.js).
  #
  # @param str [String] text to digest
  def _md5(str)
    # NOTE(review): `bare: true` reaches the JS function as an extra argument;
    # kept as-is because its effect cannot be verified from this file.
    js_context.call('newrank_md5', str, bare: true)
  end

  # Compiled JS context, built on first use and then reused.
  #
  # NOTE(review): the asset is resolved relative to this file; confirm that
  # assets/newrank_md5.js is actually shipped with the gem.
  def js_context
    @context ||= begin
      file_path = File.join(File.dirname(__FILE__), './assets/newrank_md5.js')
      ExecJS.compile(File.read(file_path))
    end
  end

  private

  # Result returned by #crawl when no document is available.
  def empty_result
    {
      active_users_count: 0,
      score: 0,
      introduce: "",
      week_data: [],
      posts_data: {}
    }
  end

  # First array literal of the parsed script, or nil when there is none.
  def first_array_node(ast)
    return nil if ast.nil?

    ast.pointcut(RKelly::Nodes::ArrayNode).matches.first
  end

  # Value of the "uuid" property inside the last assignment expression, or nil
  # when the assignment or the property is missing.
  def extract_uuid(ast)
    assignment = ast.pointcut(RKelly::Nodes::AssignExprNode).matches[-1]
    return nil if assignment.nil?

    property = assignment.pointcut(RKelly::Nodes::PropertyNode).matches.find { |n| n.name == '"uuid"' }
    return nil if property.nil?

    # Drops the first and last character - presumably the quotes around the
    # literal's source text.
    property.value.value[1..-2]
  end
end
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: newrank
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Tesla Lee
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-10-25 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A Crawler for NewRank
|
14
|
+
email: leechee89@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/newrank.rb
|
20
|
+
homepage: https://github.com/liqites/newrank_crawler
|
21
|
+
licenses:
|
22
|
+
- MIT
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">="
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.4.5
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Newrank Crawler
|
44
|
+
test_files: []
|