newrank 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/newrank.rb +139 -0
  3. metadata +44 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: eb1cbfb901aa3e8dc9117fc5df14b111596742ee
4
+ data.tar.gz: a95c42da710abe8ca57b132d9cc1365c1b393ef5
5
+ SHA512:
6
+ metadata.gz: 3971adcf2000608d38b9edd52179fe53637ab57ff6d9684569caf643a2324c65ec127c5afa07e1bc2aacee14af9a54a25c0c06f4b4ba1b886f9c2b6519552354
7
+ data.tar.gz: 1bcdf263496f182b74c3c47c843882547ebe64f9e49f39fca2b2fe18fafc5b2778049558bed8f8d3460949c8eaa96c67022e6c3030e7c80afe5e98b38b4e2e75
data/lib/newrank.rb ADDED
@@ -0,0 +1,139 @@
1
+ # coding: utf-8
2
+ require 'open-uri'
3
+ require 'rkelly'
4
+ require 'nokogiri'
5
+ require 'json'
6
+ require 'v8'
7
+ require 'execjs'
8
+ require 'rest-client'
9
+
10
# Crawler for account pages on www.newrank.cn.
#
# The entry point is #crawl; the remaining public methods are the individual
# steps it is built from (page download, script parsing, article request and
# request signing).
class Newrank
  # Page showing the public details of one account; the account id is appended.
  DETAIL_URL = 'http://www.newrank.cn/public/info/detail.html?account='.freeze
  # User-Agent header sent when downloading the detail page.
  DETAIL_USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1".freeze
  # Endpoint queried by #fetch_post.
  ARTICLES_URL = "http://www.newrank.cn/xdnphb/detail/getAccountArticle".freeze
  # User-Agent header sent with the article request.
  ARTICLES_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36".freeze
  # Alphabet and length of the `nonce` request parameter.
  HEX_DIGITS = "0123456789abcdef".freeze
  NONCE_LENGTH = 9

  # Crawl the newrank information of one account.
  #
  # @param newrank_id [String] account id; non-breaking spaces (U+00A0) are
  #   removed before it is used
  # @return [Hash] with the keys :active_users_count (Integer), :score,
  #   :introduce (String), :week_data (Array) and :posts_data; every key falls
  #   back to 0 / "" / [] / {} when its value cannot be found
  def crawl(newrank_id)
    doc = document(newrank_id.gsub("\u{a0}", ""))
    return blank_result if doc.nil?

    score, uuid = score_and_uuid(doc)
    fans = first_text(doc, ".detail-fans-counts")
    introduce = first_text(doc, ".info-detail-head-weixin-fun-introduce")
    weekly = week_data(doc)
    # The article request is signed with the uuid, so skip it when none was found.
    posts = uuid.nil? ? nil : fetch_post(uuid)

    {
      active_users_count: fans.nil? ? 0 : fans.delete(",").to_i,
      score: (score || 0),
      introduce: (introduce || ""),
      week_data: weekly,
      posts_data: (posts || {})
    }
  end

  # Request the articles of an account.
  #
  # @param uuid [String] account uuid as returned by #score_and_uuid
  # @return [Object] the parsed JSON body of the response
  def fetch_post(uuid)
    nonce = gen_nonce
    xyz = gen_xyz(nonce, uuid)

    JSON.parse(RestClient.post(ARTICLES_URL, {uuid: uuid, nonce: nonce, xyz: xyz}, {"User-Agent" => ARTICLES_USER_AGENT}))
  end

  # Extract the week data from the first <script> element of the page.
  #
  # @param doc [Nokogiri::HTML::Document] document returned by #document
  # @return [Array] one parsed JSON value per element of the first array
  #   literal in the script; empty when there is no script or no array literal
  def week_data(doc)
    array_node = first_array_node(parse_first_script(doc))
    return [] if array_node.nil?

    array_node.pointcut(RKelly::Nodes::ElementNode).matches.map do |element_node|
      JSON.parse(element_node.to_ecma)
    end
  end

  # Download the detail page of an account and parse it as HTML.
  #
  # @param newrank_account [String] account id; it is form-encoded before
  #   being placed in the query string
  # @return [Nokogiri::HTML::Document]
  def document(newrank_account)
    url = DETAIL_URL + URI.encode_www_form_component(newrank_account)
    # URI#open (mixed in by open-uri) is used instead of Kernel#open, which no
    # longer opens URLs on Ruby 3; the block form closes the handle afterwards.
    URI.parse(url).open("User-Agent" => DETAIL_USER_AGENT, :read_timeout => 10) do |page|
      Nokogiri::HTML(page, nil, 'utf-8')
    end
  end

  # Find the score and the uuid in the first <script> element of the page.
  #
  # @param doc [Nokogiri::HTML::Document] document returned by #document
  # @return [Array] `[score, uuid]`. Both are nil when the script is missing
  #   or yields no AST. Otherwise score is the "new_rank_index_mark" entry of
  #   the first array element as a Float (0.0 when absent), and uuid is the
  #   unquoted "uuid" property of the last assignment expression (nil when
  #   that cannot be found).
  def score_and_uuid(doc)
    ast = parse_first_script(doc)
    return nil, nil if ast.nil?

    array_node = first_array_node(ast)
    element_node = array_node && array_node.pointcut(RKelly::Nodes::ElementNode).matches.first
    json_data = element_node.nil? ? {} : JSON.parse(element_node.to_ecma)
    mark = json_data["new_rank_index_mark"]
    score = mark ? mark.to_f : 0.0

    return score, uuid_from(ast)
  end

  # Sleep for a random duration below one second, to avoid sending requests
  # too quickly.
  def wait_for_seconds
    sleep(rand)
  end

  # Generate the `nonce` request parameter.
  #
  # @return [String] nine random lowercase hexadecimal characters
  def gen_nonce
    Array.new(NONCE_LENGTH) { HEX_DIGITS[rand(HEX_DIGITS.size)] }.join
  end

  # Generate the `xyz` request parameter, the digest that signs the request.
  #
  # @param nonce [String] value returned by #gen_nonce
  # @param uuid [String] account uuid
  # @return the result of #_md5 for the request path and its parameters
  def gen_xyz(nonce, uuid)
    _md5("/xdnphb/detail/getAccountArticle?AppKey=joker&uuid=#{uuid}&nonce=#{nonce}")
  end

  # Hash a string with the JavaScript function `newrank_md5`, loaded from
  # assets/newrank_md5.js (described by the original author as the md5
  # algorithm written by newrank).
  #
  # NOTE(review): `bare: true` is forwarded unchanged from the original code;
  # confirm against the ExecJS version in use whether it is treated as an
  # option or passed to the JavaScript function as an extra argument.
  def _md5(str)
    js_context.call('newrank_md5', str, bare: true)
  end

  # JavaScript context compiled from assets/newrank_md5.js next to this file.
  # The context is compiled on first use and then reused.
  def js_context
    @context ||= ExecJS.compile(File.read(File.join(File.dirname(__FILE__), './assets/newrank_md5.js')))
  end

  private

  # Result returned by #crawl when no document is available.
  def blank_result
    {
      active_users_count: 0,
      score: 0,
      introduce: "",
      week_data: [],
      posts_data: {}
    }
  end

  # Text of the first node matching a CSS selector, or nil without a match.
  def first_text(doc, selector)
    node = doc.css(selector).first
    node.nil? ? nil : node.text
  end

  # AST of the first <script> element, or nil when the page has no script.
  def parse_first_script(doc)
    script = doc.css("script").first
    return nil if script.nil?

    RKelly::Parser.new.parse(script.text.strip)
  end

  # First array literal of an AST, or nil when the AST is nil or has none.
  def first_array_node(ast)
    return nil if ast.nil?

    ast.pointcut(RKelly::Nodes::ArrayNode).matches.first
  end

  # Unquoted value of the "uuid" property inside the last assignment
  # expression of the AST, or nil when either one is missing.
  def uuid_from(ast)
    assign_node = ast.pointcut(RKelly::Nodes::AssignExprNode).matches.last
    return nil if assign_node.nil?

    property = assign_node.pointcut(RKelly::Nodes::PropertyNode).matches.find { |n| n.name == '"uuid"' }
    return nil if property.nil?

    # The literal's source text still carries its surrounding quotes.
    property.value.value[1..-2]
  end
end
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: newrank
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Tesla Lee
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-10-25 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A Crawler for NewRank
14
+ email: leechee89@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/newrank.rb
20
+ homepage: https://github.com/liqites/newrank_crawler
21
+ licenses:
22
+ - MIT
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.4.5
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Newrank Crawler
44
+ test_files: []