newrank 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/newrank.rb +139 -0
  3. metadata +44 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: eb1cbfb901aa3e8dc9117fc5df14b111596742ee
4
+ data.tar.gz: a95c42da710abe8ca57b132d9cc1365c1b393ef5
5
+ SHA512:
6
+ metadata.gz: 3971adcf2000608d38b9edd52179fe53637ab57ff6d9684569caf643a2324c65ec127c5afa07e1bc2aacee14af9a54a25c0c06f4b4ba1b886f9c2b6519552354
7
+ data.tar.gz: 1bcdf263496f182b74c3c47c843882547ebe64f9e49f39fca2b2fe18fafc5b2778049558bed8f8d3460949c8eaa96c67022e6c3030e7c80afe5e98b38b4e2e75
data/lib/newrank.rb ADDED
@@ -0,0 +1,139 @@
1
+ # coding: utf-8
2
+ require 'open-uri'
3
+ require 'rkelly'
4
+ require 'nokogiri'
5
+ require 'json'
6
+ require 'v8'
7
+ require 'execjs'
8
+ require 'rest-client'
9
+
10
# Crawler for public-account detail pages on newrank.cn.
#
# The detail page embeds its data in the first <script> element; that script
# is parsed with RKelly and the interesting literals are extracted from the
# resulting AST. Recent articles are fetched from a separate JSON endpoint
# that expects a signed (nonce + xyz) request.
class Newrank
  # Detail page URL prefix; the URL-encoded account id is appended to it.
  DETAIL_URL = 'http://www.newrank.cn/public/info/detail.html?account='.freeze
  # JSON endpoint returning the articles of an account.
  ARTICLE_URL = "http://www.newrank.cn/xdnphb/detail/getAccountArticle".freeze
  # User-Agent sent when downloading the detail page.
  PAGE_USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1".freeze
  # User-Agent sent when posting to the article endpoint.
  POST_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36".freeze
  # Alphabet and length used to build the `nonce` request parameter.
  HEX_DIGITS = %w[0 1 2 3 4 5 6 7 8 9 a b c d e f].freeze
  NONCE_LENGTH = 9

  # Crawl all available information for one account.
  #
  # @param newrank_id [String] account id; non-breaking spaces (U+00A0) are removed
  # @return [Hash] with keys :active_users_count (Integer), :score (Numeric),
  #   :introduce (String), :week_data (Array) and :posts_data (parsed JSON, or
  #   {} when no uuid was found)
  def crawl(newrank_id)
    doc = document(newrank_id.gsub("\u{a0}", ""))
    return empty_result if doc.nil?

    score, uuid = score_and_uuid(doc)

    fans = doc.css(".detail-fans-counts")[0]
    intro = doc.css(".info-detail-head-weixin-fun-introduce")[0]
    weekly = week_data(doc)
    # Articles can only be requested once the page yielded a uuid.
    posts = uuid.nil? ? nil : fetch_post(uuid)

    {
      active_users_count: fans.nil? ? 0 : fans.text.gsub(",", "").to_i,
      score: (score || 0),
      introduce: intro.nil? ? "" : intro.text,
      week_data: weekly,
      posts_data: (posts || {})
    }
  end

  # Fetch the articles of an account from the JSON endpoint.
  #
  # @param uuid [String] account uuid as extracted by #score_and_uuid
  # @return [Object] the parsed JSON response
  def fetch_post(uuid)
    nonce = gen_nonce
    payload = { uuid: uuid, nonce: nonce, xyz: gen_xyz(nonce, uuid) }
    JSON.parse(RestClient.post(ARTICLE_URL, payload, { "User-Agent" => POST_USER_AGENT }))
  end

  # Extract the weekly data rows from the page's first <script>.
  #
  # @param doc [#css] parsed detail page
  # @return [Array] one parsed JSON value per element of the first array
  #   literal in the script; [] when the script or the array is missing
  def week_data(doc)
    array_node = first_array_node(script_ast(doc))
    return [] if array_node.nil?

    array_node.pointcut(RKelly::Nodes::ElementNode).matches.map do |element_node|
      JSON.parse(element_node.to_ecma)
    end
  end

  # Download and parse the detail page of an account.
  #
  # Network and HTTP errors raised by open-uri are not rescued here.
  #
  # @param newrank_account [String] account id
  # @return [Nokogiri::HTML::Document]
  def document(newrank_account)
    # OpenURI::OpenRead#open works on old and new Rubies alike (Kernel#open no
    # longer opens URLs since Ruby 3.0); the block form closes the IO for us.
    URI.parse(detail_url(newrank_account)).open("User-Agent" => PAGE_USER_AGENT, :read_timeout => 10) do |io|
      Nokogiri::HTML(io, nil, 'utf-8')
    end
  end

  # Find the account score and uuid in the page's first <script>.
  #
  # @param doc [#css] parsed detail page
  # @return [Array] `[score, uuid]`; both nil when there is no usable script,
  #   score is 0.0 when the script carries no "new_rank_index_mark", uuid is
  #   nil when no "uuid" property is present
  def score_and_uuid(doc)
    ast = script_ast(doc)
    return [nil, nil] if ast.nil?

    [score_from(ast), uuid_from(ast)]
  end

  # Sleep for a random duration below one second, to avoid sending requests
  # too quickly.
  def wait_for_seconds
    sleep(rand)
  end

  # Generate the `nonce` request parameter.
  #
  # @return [String] NONCE_LENGTH random lowercase hex digits
  def gen_nonce
    Array.new(NONCE_LENGTH) { HEX_DIGITS.sample }.join
  end

  # Generate the `xyz` request parameter (signature of the request).
  #
  # @param nonce [String] value produced by #gen_nonce
  # @param uuid [String] account uuid
  # @return [Object] whatever the JS `newrank_md5` function returns for the
  #   canonical request string
  def gen_xyz(nonce, uuid)
    _md5("/xdnphb/detail/getAccountArticle?AppKey=joker&uuid=#{uuid}&nonce=#{nonce}")
  end

  # Hash a string with the JS md5 algorithm written by newrank
  # (file assets/newrank_md5.js).
  def _md5(str)
    js_context.call('newrank_md5', str, bare: true)
  end

  # Memoized ExecJS context compiled from assets/newrank_md5.js.
  #
  # NOTE(review): the asset is read relative to this file, so it has to be
  # shipped with the gem - confirm the gemspec `files` list includes it.
  def js_context
    @context ||= ExecJS.compile(File.read(File.join(File.dirname(__FILE__), './assets/newrank_md5.js')))
  end

  private

  # Result returned by #crawl when no document is available.
  def empty_result
    { active_users_count: 0, score: 0, introduce: "", week_data: [], posts_data: {} }
  end

  # Detail page URL for an account, with the id escaped for use in a query string.
  def detail_url(newrank_account)
    DETAIL_URL + URI.encode_www_form_component(newrank_account)
  end

  # AST of the page's first <script>, or nil when the page has none.
  def script_ast(doc)
    script = doc.css("script")[0]
    return nil if script.nil?

    RKelly::Parser.new.parse(script.text.strip)
  end

  # First array literal of the AST, or nil when the AST is nil or has none.
  def first_array_node(ast)
    ast && ast.pointcut(RKelly::Nodes::ArrayNode).matches.first
  end

  # Score stored under "new_rank_index_mark" in the first element of the first
  # array literal; 0.0 when any of these is missing.
  def score_from(ast)
    array_node = first_array_node(ast)
    element_node = array_node && array_node.pointcut(RKelly::Nodes::ElementNode).matches.first
    return 0.0 if element_node.nil?

    json_data = JSON.parse(element_node.to_ecma)
    mark = json_data.is_a?(Hash) && json_data["new_rank_index_mark"]
    mark ? mark.to_f : 0.0
  end

  # Value of the "uuid" property inside the last assignment expression of the
  # script, with its surrounding quotes stripped; nil when not present.
  def uuid_from(ast)
    object_node = ast.pointcut(RKelly::Nodes::AssignExprNode).matches.last
    return nil if object_node.nil?

    property = object_node.pointcut(RKelly::Nodes::PropertyNode).matches.find { |n| n.name == '"uuid"' }
    return nil if property.nil?

    # The literal still carries its JS quotes, e.g. "\"abc\"" -> "abc".
    property.value.value[1..-2]
  end
  #------------------------
end
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: newrank
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Tesla Lee
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-10-25 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A Crawler for NewRank
14
+ email: leechee89@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/newrank.rb
20
+ homepage: https://github.com/liqites/newrank_crawler
21
+ licenses:
22
+ - MIT
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.4.5
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Newrank Crawler
44
+ test_files: []