nicoscraper 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +23 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +54 -0
- data/VERSION +1 -0
- data/lib/connector.rb +269 -0
- data/lib/converter.rb +68 -0
- data/lib/movie.rb +297 -0
- data/lib/mylist.rb +258 -0
- data/lib/nicoscraper.rb +6 -0
- data/lib/parser.rb +247 -0
- data/lib/searcher.rb +205 -0
- data/nicoscraper.gemspec +72 -0
- data/test/helper.rb +18 -0
- data/test/test_nicoscraper.rb +7 -0
- metadata +139 -0
data/lib/movie.rb
ADDED
@@ -0,0 +1,297 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'rubygems'
|
3
|
+
require 'ruby-debug'
|
4
|
+
require 'damerau-levenshtein'
|
5
|
+
require 'kconv'
|
6
|
+
|
7
|
+
require 'parser'
|
8
|
+
require 'mylist'
|
9
|
+
require 'connector'
|
10
|
+
|
11
|
+
# A single video, identified by its video id.
#
# Attributes are filled in by #set from parameter hashes. Judging by the
# section comments of the original code these hashes come from three
# sources: the mylist API, the mylist Atom feed and getthumbinfo.
class Movie
  # Similarity (as returned by Mylist#isSeries) that must be exceeded for a
  # mylist containing this movie to be reported by #isSeriesOf.
  SERIES_THRESHOLD = 0.7

  # Keys accepted by #set whose value is stored unchanged in the instance
  # variable of the same name.
  DIRECT_KEYS = %w[
    available
    video_id mylist_id item_id description
    watch create_time update_time
    memo published updated thumbnail_url length first_retrieve
    title movie_type size_high size_low
    view_counter mylist_counter comment_num last_res_body
    watch_url thumb_type embeddable movieNum_live_play
    tags_jp tags_tw tags_de tags_sp user_id
  ].freeze

  # Keys accepted by #set whose value is converted with #to_i and stored
  # under a differently named attribute.
  INTEGER_KEYS = {
    "view"   => "view_counter",
    "mylist" => "mylist_counter",
    "res"    => "comment_num"
  }.freeze

  # Keys of the nested "item_data" hash that are stored unchanged.
  ITEM_DATA_DIRECT_KEYS = {
    "video_id"       => "video_id",
    "title"          => "title",
    "thumbnail_url"  => "thumbnail_url",
    "first_retrieve" => "first_retrieve",
    "update_time"    => "update_time",
    "length_seconds" => "length",
    "last_res_body"  => "last_res_body"
  }.freeze

  # Keys of the nested "item_data" hash that are converted with #to_i.
  ITEM_DATA_INTEGER_KEYS = {
    "view_counter"   => "view_counter",
    "mylist_counter" => "mylist_counter",
    "num_res"        => "comment_num",
    "deleted"        => "deleted"
  }.freeze

  attr_accessor :available

  # MylistAPI
  attr_accessor :video_id, :mylist_id, :item_id, :description
  attr_accessor :title, :thumbnail_url, :first_retrieve, :update_time
  attr_accessor :view_counter, :mylist_counter, :comment_num
  attr_accessor :length, :deleted, :last_res_body
  attr_accessor :watch, :create_time

  # MylistAPI-Atom
  attr_accessor :memo, :published, :updated

  # getThumbInfo
  attr_accessor :movie_type, :size_high, :size_low
  attr_accessor :watch_url, :thumb_type, :embeddable, :movieNum_live_play
  attr_accessor :tags_jp, :tags_tw, :tags_de, :tags_sp, :user_id

  # video_id:: id of the video this object represents.
  # The object starts out unavailable; #getInfo or #set("available" => true)
  # marks it available.
  def initialize(video_id)
    @video_id = video_id
    @available = false
  end

  # Returns true if this movie is contained in the given mylist.
  #
  # The mylist is fetched with Mylist#getInfoLt. To avoid a second fetch,
  # any further processing of the mylist can be supplied as an optional
  # block, which receives the fetched Mylist object.
  #
  # _mylistId:: id of the mylist to look in (String or Integer).
  def isBelongsTo(_mylistId, &block)
    thisMl = Mylist.new(_mylistId)
    thisMl.getInfoLt

    isBelongs = thisMl.movies.any? { |movie| movie.video_id == @video_id }

    # Interpolation instead of String#+ so that integer ids do not raise.
    if isBelongs
      puts "\sThis movie is found in mylist/#{_mylistId}"
    else
      puts "\sThis movie is not found in mylist/#{_mylistId}"
    end

    # The block is optional; calling a missing block would raise.
    block.call(thisMl) if block

    isBelongs
  end

  # Returns the mylists in which this movie appears to be collected as a
  # series, as an array of new (not yet fetched) Mylist objects.
  #
  # Candidate mylist ids are taken from the description via #extrMylist.
  # A candidate is kept when it contains this movie and Mylist#isSeries
  # exceeds SERIES_THRESHOLD. Because the information comes from a different
  # source, the result is not necessarily consistent with #isBelongsTo.
  #
  # Returns the string "failed" when the movie is not available.
  def isSeriesOf
    unless @available
      puts "This movie object is not available."
      return "failed"
    end

    puts
    puts "Start to discern the seriality of..."
    puts "\svideo_id:\s\s#{@video_id}"
    puts "\stitle:\s\s\s\s\s#{@title}"

    seriesIds = extrMylist.select { |_mylistId|
      similarity = 0.0
      belongsTo = isBelongsTo(_mylistId) { |mylistObj|
        similarity = mylistObj.isSeries
        puts "\sSimilarity:\t" + similarity.to_s
      }
      puts belongsTo

      isSeries = belongsTo && similarity > SERIES_THRESHOLD
      puts "\s" + _mylistId.to_s + "\tis perecieved as series mylist." if isSeries
      isSeries
    }

    mlObjAry = seriesIds.map { |seriesId|
      puts seriesId
      Mylist.new(seriesId)
    }

    puts "\sDiscern logic terminated."
    mlObjAry
  end

  # Extracts the mylist ids ("mylist/<1-8 digits>") mentioned in the
  # description and returns them as an array of strings.
  #
  # Returns nil when the movie is not available. A missing description is
  # treated like an empty one.
  def extrMylist
    return unless @available
    puts "Extracting mylistId from the description..."

    mylistIdAry = @description.to_s.scan(/mylist\/([0-9]{1,8})/).flatten

    if mylistIdAry.empty?
      puts "\sMylistId is not found."
    else
      mylistIdAry.each { |id| puts "\sID:\t#{id} is extracted." }
    end

    mylistIdAry
  end

  # Fetches /api/getthumbinfo/<video_id> from ext.nicovideo.jp through
  # Connector, parses it with NicoParser.getThumbInfo and stores the result
  # with #set.
  #
  # Returns true on success. When the request yields nil or "failed", or the
  # response carries status="fail", the movie is marked unavailable and the
  # string "failed" is returned.
  def getInfo
    con = Connector.new('xml')
    con.setWait(nil)
    xml = con.xmlGet('ext.nicovideo.jp', "/api/getthumbinfo/#{@video_id}")

    if xml.nil? || xml == "failed" ||
       xml =~ /<nicovideo_thumb_response\sstatus=\"fail\">/
      @available = false
      return "failed"
    end

    set(NicoParser.getThumbInfo(xml))
    @available = true
  end

  # Copies the recognised entries of paramObj into this object's attributes.
  #
  # Only String keys listed in DIRECT_KEYS / INTEGER_KEYS and the nested
  # "item_data" hash are used; every other key is ignored. Returns paramObj.
  def set(paramObj)
    paramObj.each_key { |key|
      value = paramObj[key]
      if key == "item_data"
        set_item_data(value)
      elsif DIRECT_KEYS.include?(key)
        instance_variable_set("@#{key}", value)
      elsif INTEGER_KEYS.key?(key)
        instance_variable_set("@#{INTEGER_KEYS[key]}", value.to_i)
      end
    }
  end

  private

  # Copies the recognised entries of a nested "item_data" hash; a missing
  # (nil) hash is ignored.
  def set_item_data(item_data)
    return unless item_data

    item_data.each_key { |key|
      value = item_data[key]
      if ITEM_DATA_DIRECT_KEYS.key?(key)
        instance_variable_set("@#{ITEM_DATA_DIRECT_KEYS[key]}", value)
      elsif ITEM_DATA_INTEGER_KEYS.key?(key)
        instance_variable_set("@#{ITEM_DATA_INTEGER_KEYS[key]}", value.to_i)
      end
    }
  end
end
|
data/lib/mylist.rb
ADDED
@@ -0,0 +1,258 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'rubygems'
|
3
|
+
require 'ruby-debug'
|
4
|
+
require 'kconv'
|
5
|
+
|
6
|
+
require 'parser'
|
7
|
+
require 'movie'
|
8
|
+
require 'connector'
|
9
|
+
|
10
|
+
|
11
|
+
# A mylist (a user-made list of videos), identified by its mylist id.
class Mylist
  # Share of the movies that must come from one uploader for #userId to
  # report true.
  SAME_PUBLISHER_RATIO = 0.9

  # Maps the keys accepted by #set to the attribute they are stored in.
  FIELD_ATTRIBUTES = {
    "mylist_id"    => "mylist_id",
    "id"           => "mylist_id",
    "user_id"      => "user_id",
    "title"        => "title",
    "name"         => "title",
    "description"  => "description",
    "public"       => "public",
    "default_sort" => "default_sort",
    "create_time"  => "create_time",
    "update_time"  => "update_time",
    "icon_id"      => "icon_id",
    "sort_order"   => "sort_order",
    "movies"       => "movies",
    "updated"      => "update_time",
    "author"       => "author"
  }.freeze

  attr_accessor :available

  attr_accessor :mylist_id, :user_id, :title, :description, :public
  attr_accessor :default_sort, :create_time, :update_time, :icon_id
  attr_accessor :sort_order, :author

  attr_accessor :movies

  # mylist_id:: id of the mylist this object represents.
  # The object starts out empty and unavailable.
  def initialize(mylist_id)
    @mylist_id = mylist_id
    @movies = []
    @available = false
  end

  # Computes a similarity figure from the Damerau-Levenshtein distances
  # between all ordered pairs of movie titles (self-pairs included):
  #
  #   1 - ((sum of distances / number of movies) / length of mylist title)
  #
  # Returns 0.0 when there are no movies or the mylist title is empty,
  # because the formula is undefined in those cases.
  #
  # This is O(n^2) in the number of movies; the original author accepted
  # that because a mylist was expected to hold at most 500 movies.
  def simOfTitle
    puts "matching..."

    titleLength = title.to_s.length
    similarity = 0.0

    unless @movies.empty? || titleLength.zero?
      total = 0.0
      @movies.each { |myself|
        @movies.each { |another|
          total += DamerauLevenshtein.distance(myself.title, another.title)
        }
      }
      similarity = 1 - ((total / @movies.length) / titleLength)
    end

    puts "Similarity: " + similarity.to_s
    similarity
  end

  # Returns true when a single uploader (user_id) accounts for at least
  # SAME_PUBLISHER_RATIO of the movies in this mylist, false otherwise
  # (including for an empty mylist).
  def userId
    return false if @movies.empty?

    # Hash.new(0) so that the first increment for an id starts from zero.
    idGroup = Hash.new(0)
    @movies.each { |movie| idGroup[movie.user_id] += 1 }

    required = @movies.length * SAME_PUBLISHER_RATIO
    idGroup.values.any? { |count| count >= required }
  end

  # Estimates whether this mylist collects a series by averaging a title
  # similarity over every unordered pair of movies.
  #
  # For a pair the similarity is 1.0 - distance / (length of the later
  # movie's title), using the Damerau-Levenshtein distance. When that title
  # is empty the pair counts as 1.0 if the distance is zero and 0.0
  # otherwise, so that no division by zero takes place.
  #
  # Returns the average as a Float, or 0 when there are fewer than two
  # movies.
  def isSeries
    dlAry = []

    @movies.each_with_index { |earlier, index|
      @movies[(index + 1)..-1].each { |later|
        distance = DamerauLevenshtein.distance(later.title, earlier.title)
        dlAry.push(pair_similarity(distance, later.title.length))
      }
    }

    return 0 if dlAry.empty?

    dlAry.inject(0) { |sum, score| sum + score } / dlAry.length
  end

  # Fetches the mylist page http://www.nicovideo.jp/mylist/<id> through
  # Connector and fills this object from the script embedded in the page:
  # the mylist's own fields from the text between MylistGroup.preloadSingle
  # and Mylist.preload(, and one Movie per element of the Mylist.preload
  # payload.
  #
  # Returns the string "failed" when the mylist section cannot be found in
  # the page; otherwise the array of raw movie JSON fragments.
  def getInfo
    con = Connector.new('mech')
    con.setWait(nil)
    mechPage = con.mechGet("http://www.nicovideo.jp/mylist/#{@mylist_id}")

    # Both the mylist's own data and its movies live in the same script tag.
    jsonStr = mechPage.search("/html/body/div[2]/div/div[2]/script[7]").to_html

    # Information about the mylist itself.
    mlJson = jsonStr[/MylistGroup\.preloadSingle.{1,}?Mylist\.preload\(/m]
    return "failed" unless mlJson

    # An empty description is stored as nil.
    desc = quoted_field(mlJson, "description")
    desc = nil if desc && desc.empty?

    set({
      "id"           => numeric_field(mlJson, "id"),
      "user_id"      => numeric_field(mlJson, "user_id"),
      "name"         => quoted_field(mlJson, "name"),
      "description"  => desc,
      "public"       => numeric_field(mlJson, "public"),
      "default_sort" => numeric_field(mlJson, "default_sort"),
      "create_time"  => numeric_field(mlJson, "create_time"),
      "update_time"  => numeric_field(mlJson, "update_time"),
      "icon_id"      => numeric_field(mlJson, "icon_id")
    })

    # Information about the movies contained in the mylist.
    mvJson = jsonStr[/Mylist.preload.+/]
    mvJson = mvJson[/\".{1,}/] if mvJson

    fragments = []
    if mvJson
      # The last five characters are dropped as in the original code;
      # presumably they are the closing "}]);" plus a line-end character —
      # TODO confirm against a live page.
      mvJson = mvJson.slice(0, mvJson.length - 5)
      fragments = Unicode.unescape(mvJson).split('},{')
    end

    fragments.each { |fragment|
      param = JSON.parse("{" + fragment + "}")
      movie = Movie.new(param['item_data']['video_id'])
      movie.set(param)

      @movies.push(movie)
    }
  end

  # Lightweight variant of #getInfo: fetches the Atom feed
  # /mylist/<id>?rss=atom&numbers=1 from www.nicovideo.jp through Connector
  # and parses it with NicoParser.mylistRss.
  #
  # Every feed entry becomes an available Movie appended to #movies, the
  # "mylist" part is stored with #set and the mylist is marked available.
  # Returns true on success and nil when the request returned "failed".
  def getInfoLt
    con = Connector.new('xml')
    puts @mylist_id
    con.setWait(nil)
    xml = con.xmlGet(
      'www.nicovideo.jp',
      "/mylist/#{@mylist_id}?rss=atom&numbers=1"
    )

    return if xml == "failed"

    parsed = NicoParser.mylistRss(xml)

    (parsed["entry"] || []).each { |entry|
      movie = Movie.new(entry["video_id"])
      entry["available"] = true
      movie.set(entry)
      @movies.push(movie)
    }

    set(parsed["mylist"])
    @available = true
  end

  # Copies the entries of paramObj whose String key is listed in
  # FIELD_ATTRIBUTES into the corresponding attribute; other keys are
  # ignored. Returns paramObj.
  def set(paramObj)
    paramObj.each_key { |key|
      attribute = FIELD_ATTRIBUTES[key]
      instance_variable_set("@#{attribute}", paramObj[key]) if attribute
    }
  end

  private

  # Similarity of one title pair; see #isSeries for the zero-length rule.
  def pair_similarity(distance, length)
    return(distance.zero? ? 1.0 : 0.0) if length.zero?

    1.0 - distance.fdiv(length)
  end

  # Returns the first run of digits on the "<key>: ..." line of js, or nil
  # when there is no such line.
  def numeric_field(js, key)
    line = js[/\s#{Regexp.escape(key)}:[^\n]{1,}/]
    line && line[/[0-9]{1,}/]
  end

  # Returns the text between the first and the last double quote on the
  # "<key>: "..."" line of js, or nil when there is no such line.
  def quoted_field(js, key)
    js[/\s#{Regexp.escape(key)}:\s*"(.*)"/, 1]
  end
end
|