nicoscraper 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,318 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.unshift File.dirname(__FILE__)
3
+
4
+ require 'rubygems'
5
+ require 'ruby-debug'
6
+ require 'kconv'
7
+
8
+ require 'parser.rb'
9
+ require 'movie.rb'
10
+ require 'connector.rb'
11
+
12
+
13
+ module Nicos
14
+ class Mylist
15
# Creates an empty wrapper for one mylist. Nothing is fetched here; the
# movie list stays empty and the instance is flagged unavailable until a
# loader fills it in.
#
# @param mylist_id [Object] identifier of the mylist, stored as given
def initialize(mylist_id)
  @mylist_id = mylist_id
  @movies = []
  @available = false
end
20
+
21
# Compares the titles of every pair of movies in this mylist and returns
# the mean similarity.
#
# For each pair the edit distance reported by DamerauLevenshtein.distance
# is divided by the length of the later movie's title and subtracted from
# 1.0, so a pair scores at most 1.0 and may score below zero.
#
# NOTE(review): an empty title makes the divisor zero, which yields a
# non-finite score -- confirm whether titles can be empty.
#
# @return [Float] mean pairwise similarity; 0.0 when the mylist holds
#   fewer than two movies (there is no pair to compare)
def getSimilarity
  scores = @movies.combination(2).map { |earlier, later|
    distance = DamerauLevenshtein.distance(later.title, earlier.title)
    1.0 - distance.fdiv(later.title.length)
  }
  return 0.0 if scores.empty?

  scores.inject(0.0) { |total, score| total + score } / scores.length
end
61
+
62
# Scrapes the mylist's HTML page and loads both the mylist header fields
# and the movies it contains.
#
# The header values are handed to #set under the keys "id", "user_id",
# "name", "description", "public", "default_sort", "create_time",
# "update_time" and "icon_id"; the numeric fields are passed as digit
# Strings. One Nicos::Movie per listed item is appended to @movies.
#
# NOTE(review): the parsing depends on the exact page markup (a fixed XPath
# and a 5-character tail on the Mylist.preload line) -- confirm against a
# live page.
#
# @return [Array<String>] the per-movie JSON fragments that were parsed
# @raise [NoMethodError] when the expected script block or a numeric field
#   is missing from the page
def getInfoHtml
  con = Nicos::Connector::Html.new('mech')
  reqUrl = 'http://www.nicovideo.jp' +
           '/mylist/' + @mylist_id.to_s
  mechPage = con.mechGet(reqUrl)

  # The mylist header and the movie list live in the same inline script,
  # so the node is looked up once and reused for both parts.
  jsonStr = mechPage.search(
    "/html/body/div[2]" +
    "/div/div[2]/script[7]"
  ).to_html

  # --- the mylist's own information ---
  reg = /MylistGroup\.preloadSingle.{1,}?Mylist\.preload\(/m
  mlJson = jsonStr.scan(reg)[0]

  # First run of digits on the line selected by +pattern+.
  digits = lambda { |pattern| mlJson[pattern][/[0-9]{1,}/] }
  # Text between the first and the last double quote on the selected line.
  quoted = lambda { |pattern|
    line = mlJson[pattern]
    line && line[/"(.*)"/, 1]
  }

  id           = digits.call(/\sid:[^\n]{1,}/)
  user_id      = digits.call(/\suser_id:[^\n]{1,}/)
  name         = quoted.call(/\sname:[^\n]{1,}/)
  desc         = quoted.call(/\sdescription:.{1,}/)
  visibility   = digits.call(/\spublic:[^,]{1,}/)
  default_sort = digits.call(/\sdefault_sort:[^\n]{1,}/)
  create_time  = digits.call(/\screate_time:[^\n]{1,}/)
  update_time  = digits.call(/\supdate_time:[^\n]{1,}/)
  icon_id      = digits.call(/\sicon_id:[^\n]{1,}/)

  # A blank description is reported as nil rather than as "".
  desc = nil if desc && desc.empty?

  paramObj = {
    "id"           => id,
    "user_id"      => user_id,
    "name"         => name,
    "description"  => desc,
    "public"       => visibility,
    "default_sort" => default_sort,
    "create_time"  => create_time,
    "update_time"  => update_time,
    "icon_id"      => icon_id
    # "sort_order" => ,
  }
  set(paramObj)

  # --- the movies contained in this mylist ---
  mvJson = jsonStr.scan(/Mylist.preload.+/)[0]
  mvJson = mvJson.scan(/\".{1,}/)[0]
  # Drop the 5 trailing characters that close the preload call, then cut
  # the array into one fragment per item.
  mvJson = mvJson.slice(0, mvJson.length - 5)
  mvJson = Nicos::Unicode.unescape(mvJson).split('},{')

  mvJson.each { |fragment|
    # The split (and the trimming above) removed the surrounding braces.
    param = JSON.parse("{" + fragment + "}")
    movie = Nicos::Movie.new(param['item_data']['video_id'])
    movie.set(param)

    @movies.push(movie)
  }
end
146
+
147
# Loads the mylist and the movies it contains from the mylist's Atom feed.
#
# On success one Nicos::Movie per feed entry is appended to @movies (each
# entry is flagged "available" => true before being handed to Movie#set),
# @available becomes true and the mylist-level values are passed to #set.
# On any other response @available is set to false and nothing else changes.
#
# @return [Object, false] whatever #set returns on success, false otherwise
def getInfo
  con = Nicos::Connector::MylistAtom.new()
  host = 'www.nicovideo.jp'
  entity = '/mylist/' + @mylist_id.to_s + '?rss=atom&numbers=1'
  result = con.get(host, entity)

  return @available = false unless result["order"] == "success"

  parsed = Nicos::Parser::mylistAtom(result["body"])

  parsed["entry"].each { |e|
    movie = Nicos::Movie.new(e["video_id"])
    e["available"] = true
    movie.set(e)
    @movies.push(movie)
  }

  @available = true
  set(parsed["mylist"])
end
175
+
176
# Copies the recognised entries of a parameter hash into this instance's
# attributes (the mylist-side counterpart of {Movie#set}).
#
# Recognised String keys and the attribute each one fills:
# "mylist_id"/"id" -> mylist_id, "user_id", "title"/"name" -> title,
# "description", "public", "default_sort", "create_time",
# "update_time"/"updated" -> update_time, "icon_id", "sort_order",
# "movies", "author". Any other key is ignored.
#
# "name" is accepted as an alias of "title" because the HTML loader in this
# class supplies the mylist title under that key.
#
# @param paramObj [Hash{String => Object}] values to store
# @return [Hash] paramObj itself
def set(paramObj)
  paramObj.each_pair { |key, param|
    case key
    when "mylist_id", "id"
      @mylist_id = param
    when "user_id"
      @user_id = param
    when "title", "name"
      @title = param
    when "description"
      @description = param
    when "public"
      @public = param
    when "default_sort"
      @default_sort = param
    when "create_time"
      @create_time = param
    when "update_time", "updated"
      @update_time = param
    when "icon_id"
      @icon_id = param
    when "sort_order"
      @sort_order = param
    when "movies"
      @movies = param
    when "author"
      @author = param
    end
  }
end
213
+
214
+ include Nicos::Connector::SetWait
215
+
216
+
217
+ # True once a loader such as getInfo has fetched this mylist successfully.
218
+ # NOTE(review): the original note says other methods require this to be true; no such check is visible in this class.
219
+ #
220
+ # @return [Boolean]
221
+ attr_accessor :available
222
+
223
+ # Mylist ID.
224
+ #
225
+ # @return [Fixnum] NOTE(review): stored as passed in; getInfoHtml supplies a digit String -- confirm
226
+ # <b>Populated by</b>
227
+ # {Nicos::Mylist#getInfo}
228
+ # {Nicos::Mylist#getInfoHtml}
229
+ attr_accessor :mylist_id
230
+
231
+ # User ID.
232
+ #
233
+ # @return [Fixnum] NOTE(review): getInfoHtml supplies a digit String -- confirm
234
+ # <b>Populated by</b>
235
+ # {Nicos::Mylist#getInfoHtml}
236
+ attr_accessor :user_id
237
+
238
+ # Mylist title.
239
+ #
240
+ # @return [String]
241
+ # <b>Populated by</b>
242
+ # {Nicos::Mylist#getInfo}
243
+ # {Nicos::Mylist#getInfoHtml} NOTE(review): that loader passes the key "name"; confirm #set stores it as the title
244
+ attr_accessor :title
245
+
246
+ # Mylist description text.
247
+ #
248
+ # @return [String]
249
+ # <b>Populated by</b>
250
+ # {Nicos::Mylist#getInfo}
251
+ # {Nicos::Mylist#getInfoHtml} NOTE(review): confirm the scraped text actually reaches #set
252
+ attr_accessor :description
253
+
254
+ # Visibility setting.
255
+ #
256
+ # The meaning of the value is still under investigation.
257
+ # @return [Fixnum] NOTE(review): getInfoHtml supplies a digit String -- confirm
258
+ # <b>Populated by</b>
259
+ # {Nicos::Mylist#getInfo} NOTE(review): the Atom path in this file does not appear to set this -- confirm
260
+ # {Nicos::Mylist#getInfoHtml}
261
+ attr_accessor :public
262
+
263
+ # Default sort-order setting.
264
+ #
265
+ # Stored exactly as received from the loader.
266
+ # @return [Fixnum] NOTE(review): getInfoHtml supplies a digit String -- confirm
267
+ # <b>Populated by</b>
268
+ # {Nicos::Mylist#getInfo} NOTE(review): the Atom path in this file does not appear to set this -- confirm
269
+ # {Nicos::Mylist#getInfoHtml}
270
+ attr_accessor :default_sort
271
+
272
+ # Creation time of the mylist.
273
+ #
274
+ # @return [Fixnum] NOTE(review): getInfoHtml supplies a digit String -- confirm
275
+ # <b>Populated by</b>
276
+ # {Nicos::Mylist#getInfo} NOTE(review): the Atom path in this file does not appear to set this -- confirm
277
+ # {Nicos::Mylist#getInfoHtml}
278
+ attr_accessor :create_time
279
+
280
+ # Last update time of the mylist.
281
+ #
282
+ # @return [Fixnum] NOTE(review): getInfoHtml supplies a digit String -- confirm
283
+ # <b>Populated by</b>
284
+ # {Nicos::Mylist#getInfo}
285
+ # {Nicos::Mylist#getInfoHtml}
286
+ attr_accessor :update_time
287
+
288
+ # Icon colour? (meaning unconfirmed by the original author)
289
+ #
290
+ # @return [Fixnum] NOTE(review): getInfoHtml supplies a digit String -- confirm
291
+ # <b>Populated by</b>
292
+ # {Nicos::Mylist#getInfo} NOTE(review): the Atom path in this file does not appear to set this -- confirm
293
+ # {Nicos::Mylist#getInfoHtml}
294
+ attr_accessor :icon_id
295
+
296
+ # Current sort order.
297
+ #
298
+ # @return [Fixnum]
299
+ # <b>Populated by</b>
300
+ # {Nicos::Mylist#getInfo} NOTE(review): no loader in this file passes "sort_order" to #set -- confirm
301
+ # {Nicos::Mylist#getInfoHtml} NOTE(review): the "sort_order" entry is commented out in that loader
302
+ attr_accessor :sort_order
303
+
304
+ # Name of the mylist's author.
305
+ #
306
+ # @return [String]
307
+ # <b>Populated by</b>
308
+ # {Nicos::Mylist#getInfo}
309
+ attr_accessor :author
310
+
311
+ # Array of the movie instances contained in this mylist.
312
+ #
313
+ # Loaders such as getInfo create one Movie instance per movie in the
314
+ # mylist and append it to this array.
315
+ # @return [Array<Movie>]
316
+ attr_accessor :movies
317
+ end
318
+ end
@@ -0,0 +1,235 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.unshift File.dirname(__FILE__)
3
+
4
+ require 'rubygems'
5
+ require 'xml'
6
+ require 'time'
7
+
8
+ require 'converter.rb'
9
+
10
+ module Nicos
11
+ module Parser
12
# Parses the XML returned by getThumbInfo into a flat Hash.
#
# Text elements are copied as Strings, counters and flags are converted
# with to_i, first_retrieve goes through Nicos::Converter.iso8601ToUnix,
# length ("m:ss") becomes a number of seconds, and tags are collected per
# domain under "tags_<domain>".
#
# A domain that appears in more than one <tags> element keeps the tags it
# already collected; the reader is closed even when parsing raises.
#
# @param xml [String] raw XML document
# @return [Hash{String => Object}] parsed values keyed by element name
def getThumbInfo(xml)
  doc = XML::Reader.string(
    xml,
    :options => XML::Parser::Options::NOBLANKS |
                XML::Parser::Options::NOENT
  )

  parsed = {}
  category = ""

  while doc.read
    next if doc.node_type == XML::Reader::TYPE_END_ELEMENT

    case doc.name
    when "video_id", "title", "description", "thumbnail_url",
         "movie_type", "last_res_body", "watch_url", "thumb_type"
      label = doc.name
      doc.read # step onto the element's text node
      parsed[label] = doc.value
    when "size_high", "size_low", "view_counter", "comment_num",
         "mylist_counter", "embeddable", "no_live_play",
         "user_id"
      label = doc.name
      doc.read
      parsed[label] = doc.value.to_i
    when "first_retrieve"
      label = doc.name
      doc.read
      parsed[label] = Nicos::Converter.iso8601ToUnix(doc.value)
    when "length"
      doc.read
      minutes, seconds = doc.value.split(/\:/)
      parsed["length"] = minutes.to_i * 60 + seconds.to_i
    when "tags"
      doc.move_to_attribute("domain")
      category = doc.value
      # Create the list only once per domain.
      parsed["tags_" + category] ||= []
    when "tag"
      doc.read
      (parsed["tags_" + category] ||= []).push(doc.value)
    end
  end

  parsed
ensure
  doc.close if doc
end
65
+
66
# Parses the Atom feed returned by a tag search into an Array with one
# Hash per <entry>.
#
# Each entry Hash may carry "title", "video_id", "published", "updated",
# "thumbnail_url", "description", "length" (seconds) and the counters
# "view", "res" and "mylist".
#
# Feed-level nodes seen before the first <entry> are written into a
# placeholder Hash that the first entry then replaces. A feed without any
# entry therefore still yields a one-element Array holding that placeholder.
#
# The reader is closed even when parsing raises.
#
# @param xml [String] raw Atom document
# @return [Array<Hash{String => Object}>]
def tagAtom(xml)
  doc = XML::Reader.string(
    xml,
    :options => XML::Parser::Options::NOBLANKS |
                XML::Parser::Options::NOENT
  )

  n = -1
  parsed = [{}] # placeholder; parsed[-1] absorbs the feed-level nodes

  while doc.read
    next if doc.node_type == XML::Reader::TYPE_END_ELEMENT

    case doc.name
    when "entry"
      n += 1
      parsed[n] = {}
    when "title"
      doc.read
      parsed[n]["title"] = doc.value
    when "link"
      doc.move_to_attribute("href")
      # The video id is the fifth '/'-separated piece of the link.
      parsed[n]["video_id"] = doc.value.split('/')[4]
    when "published", "updated"
      label = doc.name
      doc.read
      parsed[n][label] = Nicos::Converter.iso8601ToUnix(doc.value)
    when "p"
      doc.move_to_attribute("class")
      case doc.value
      when "nico-thumbnail"
        doc.read
        doc.move_to_attribute("src")
        parsed[n]["thumbnail_url"] = doc.value
      when "nico-description"
        doc.read
        parsed[n]["description"] = doc.value
      end
    when "strong"
      doc.move_to_attribute("class")
      case doc.value
      when "nico-info-length"
        doc.read
        minutes, seconds = doc.value.split(/\:/)
        parsed[n]["length"] = minutes.to_i * 60 + seconds.to_i
      when "nico-numbers-view", "nico-numbers-res",
           "nico-numbers-mylist"
        label = doc.value
        doc.read
        # Drop the 13-character "nico-numbers-" prefix to get the key.
        parsed[n][label.slice(13, 99)] = doc.value.to_i
      end
    end
  end

  parsed
ensure
  doc.close if doc
end
127
+
128
# Parses a mylist's Atom feed into
# { "mylist" => Hash, "entry" => Array<Hash> }.
#
# "mylist" may carry "title", "description", "mylist_id", "updated" and
# "author". Each entry may carry "title", "video_id", "item_id", "updated"
# and the values scraped from its HTML <content>: "memo", "thumbnail_url",
# "description", "length", "first_retrieve", "view", "res" and "mylist".
# The scraped values are left as Strings; a value that is not found is nil.
#
# <title>, <id> and <updated> occur both at feed level and inside each
# entry, so they are routed by position: before the first <entry> they
# describe the mylist, afterwards the current entry. A feed without entries
# yields an empty "entry" Array. The reader is closed even when parsing
# raises.
#
# @param xml [String] raw Atom document
# @return [Hash{String => Object}]
def mylistAtom(xml)
  doc = XML::Reader.string(
    xml,
    :options => XML::Parser::Options::NOBLANKS |
                XML::Parser::Options::NOENT
  )

  # Patterns for the values embedded in an entry's HTML content. The length,
  # date and counter classes are accepted on <p> as well as on <strong>,
  # the element the tag-search feed parser reads them from.
  contentPatterns = [
    ["memo",           /<p\sclass="nico-memo">([^<]{1,})/],
    ["thumbnail_url",  /<p\sclass="nico-thumbnail">.+src="(http:\/\/[^"]{1,})/],
    ["description",    /<p\sclass="nico-description">([^<]{1,})/],
    ["length",         /<(?:p|strong)\sclass="nico-info-length">([^<]{1,})/],
    ["first_retrieve", /<(?:p|strong)\sclass="nico-info-date">([^<]{1,})/],
    ["view",           /<(?:p|strong)\sclass="nico-numbers-view">([^<]{1,})/],
    ["res",            /<(?:p|strong)\sclass="nico-numbers-res">([^<]{1,})/],
    ["mylist",         /<(?:p|strong)\sclass="nico-numbers-mylist">([^<]{1,})/]
  ]

  n = -1 # index of the current entry; -1 while still in the feed header
  parsed = { "mylist" => {}, "entry" => [] }

  while doc.read
    next if doc.node_type == XML::Reader::TYPE_END_ELEMENT

    case doc.name
    when "title"
      doc.read
      if n == -1
        # The feed title wraps the mylist name in a 6-character prefix and
        # a 7-character suffix (NOTE(review): presumably fixed site
        # branding -- confirm); keep only the part in between.
        feedTitle = doc.value
        parsed["mylist"]["title"] = feedTitle.slice(6, feedTitle.length - 13)
      else
        parsed["entry"][n]["title"] = doc.value
      end
    when "link"
      if n != -1
        doc.move_to_attribute("href")
        parsed["entry"][n]["video_id"] =
          Nicos::Extractor.videoId(doc.value)
      end
    when "subtitle"
      doc.read
      parsed["mylist"]["description"] = doc.value
    when "id"
      doc.read
      if n == -1
        parsed["mylist"]["mylist_id"] =
          Nicos::Extractor.mylistId(doc.value)
      else
        parsed["entry"][n]["item_id"] =
          Nicos::Extractor.itemId(doc.value)
      end
    when "updated"
      doc.read
      target = n == -1 ? parsed["mylist"] : parsed["entry"][n]
      target["updated"] = Nicos::Converter.iso8601ToUnix(doc.value)
    when "name"
      doc.read
      parsed["mylist"]["author"] = doc.value
    when "entry"
      n += 1
      parsed["entry"][n] = {}
    when "content"
      doc.read
      if n != -1
        html = doc.value.to_s
        contentPatterns.each { |key, pattern|
          parsed["entry"][n][key] = html[pattern, 1]
        }
      end
    end
  end

  parsed
ensure
  doc.close if doc
end
230
+
231
+ module_function :tagAtom
232
+ module_function :mylistAtom
233
+ module_function :getThumbInfo
234
+ end
235
+ end