nicoscraper 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -51,4 +51,5 @@ Rake::RDocTask.new do |rdoc|
51
51
  rdoc.title = "nicoscraper #{version}"
52
52
  rdoc.rdoc_files.include('README*')
53
53
  rdoc.rdoc_files.include('lib/**/*.rb')
54
+ rdoc.rdoc_files.include('lib/*.rb')
54
55
  end
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.2.0
data/lib/connector.rb CHANGED
@@ -1,86 +1,264 @@
1
1
  # -*- encoding: utf-8 -*-
2
+ $:.unshift File.dirname(__FILE__)
3
+
2
4
  require 'rubygems'
3
5
  require 'ruby-debug'
4
6
  require 'net/http'
5
7
 
8
+
6
9
  class Connector
7
- def initialize(mode)
8
- @mode = mode
10
+ def initialize
9
11
  # デフォルトのウェイト設定
12
+ @seqTime = 0
13
+
10
14
  @waitConfig = {
11
- 'consec_count' => 10, # 連続してリクエストする回数
12
- 'consec_wait' => 10, # 連続リクエスト後のウェイト
13
- 'each' => 10, # 連続リクエスト時の、1リクエスト毎のウェイト
14
-
15
- '200-abnormal' => 1, # アクセス拒絶時(「短時間での連続アクセスは・・・」)の場合の再試行までの時間
16
- 'unavailable' => 10,
17
- '403' => 1, # "403"時の再試行までのウェイト
18
- '404' => 1, # "403"時の再試行までのウェイト
19
- 'increment' => 1, # アクセス拒絶時の、次回以降の1リクエスト毎のウェイトの増加量
15
+ 'seqAccLimit' => 10, # 連続してリクエストする回数
16
+ 'afterSeq' => 10, # 連続リクエスト後のウェイト
17
+ 'each' => 1, # 連続リクエスト時の、1リクエスト毎のウェイト
20
18
 
21
- 'timeout' => 5, # タイムアウト時の、再試行までのウェイト
22
- '500' => 1, # "500"時の再試行までのウェイト
23
- '503' => 1, # "503"時の再試行までのウェイト
24
-
25
- 'retryLimit' => 5 # 再試行回数の限度
19
+ 'increment' => 1, # アクセス拒絶時の、次回以降の1リクエスト毎のウェイトの増加量
20
+ '' => 100,
21
+
22
+ 'deniedSeqReq'=> {
23
+ 'retryLimit' => 3,
24
+ 'wait' => 120
25
+ },
26
+
27
+ 'serverIsBusy'=> {
28
+ 'retryLimit' => 3,
29
+ 'wait' => 120
30
+ },
31
+
32
+ 'serviceUnavailable' => {
33
+ 'retryLimit' => 3,
34
+ 'wait' => 120
35
+ },
36
+
37
+ 'timedOut' => {
38
+ 'retryLimit' => 3,
39
+ 'wait' => 10
40
+ }
26
41
  }
27
-
28
- # 1つの検索結果画面に表示される動画の数。現時点では10個。
29
- @NumOfSearched = 32
30
-
31
- if @mode == "mech"
32
- @mech = Mechanize.new
33
- # メモリ節約のため、Mechanizeの履歴機能をオフにする。
34
- @mech.max_history = 1
35
- end
36
42
 
37
- @consec_count = 0
43
+ @result = {}
38
44
  end
39
45
 
40
46
  private
41
47
 
42
- def mixin(targetObj, overWriteObj)
43
- output = Marshal.load(Marshal.dump(targetObj))
44
- if targetObj.instance_of?(Hash)
45
- overWriteObj.each_key { |key|
46
- overWriteObj[key] = mixin(targetObj[key], overWriteObj[key])
47
- output[key] = overWriteObj[key]
48
- }
49
- else
50
- output = overWriteObj
48
+ def notPublic
49
+ # マイリスト非公開のときに403になる。後で専用の処理を入れるべき。
50
+ puts "This movie/mylist is not public."
51
+ @result = "notPublic"
52
+ return { "order" => "terminate" }
53
+ end
54
+
55
+ def limInCommunity
56
+ puts "This movie/mylist is limited in comunity members."
57
+ # ex. item_id -> 1294702905
58
+ @result = "limInCommunity"
59
+ return { "order" => "terminate" }
60
+ end
61
+
62
+ def notFound
63
+ puts "This movie/mylist is not found."
64
+ @result = "notFound"
65
+ return { "order" => "terminate" }
66
+ end
67
+
68
+ def deleted
69
+ puts "This movie/mylist is deleted."
70
+ @result = "deleted"
71
+ return { "order" => "terminate" }
72
+ end
73
+
74
+ def deniedSeqReq
75
+ puts "Denied sequential requests."
76
+ sleep @waitConfig["deniedSeqReq"]
77
+ @result = "deniedSeqReq"
78
+ return { "order" => "retry" }
79
+ end
80
+
81
+ def serverIsBusy
82
+ puts "The server is busy."
83
+ sleep @waitConfig["serverIsBusy"]
84
+ @result = "serverIsBusy"
85
+ return { "order" => "retry" }
86
+ end
87
+
88
+ def serviceUnavailable
89
+ puts "Service unavailable."
90
+ sleep @waitConfig["serviceUnavailable"]
91
+ @result = "serviceUnavailable"
92
+ return { "order" => "retry" }
93
+ end
94
+
95
+ def timedOut
96
+ puts "Request timed out."
97
+ sleep @waitConfig["timedOut"]
98
+ @result = "timedOut"
99
+ return { "order" => "retry" }
100
+ end
101
+
102
+ def success(resBody)
103
+ sleep @waitConfig["each"]
104
+ @seqTime += 1
105
+
106
+ if @seqTime >= @waitConfig["seqAccLimit"]
107
+ sleep @waitConfig["afterSeq"]
108
+ @seqTime = 0
51
109
  end
52
- return output
110
+ return { "order" => "success", "body" => resBody }
53
111
  end
54
-
55
- public
56
-
112
+
113
+ def wait(status)
114
+ puts "Wait for " + waitTime + " second."
115
+ sleep @waitConfig[status.to_s]
116
+ end
117
+
118
+ public
119
+
57
120
  def setWait(waitConfig)
58
121
  if waitConfig != nil
59
122
  @waitConfig = mixin(@waitConfig, waitConfig)
60
123
  end
61
124
  end
125
+ end
62
126
 
63
- def eachWait
64
- # ウェイト...1回目の場合は無視 -------------------------
65
- if @consec_count != 0
66
- # 動画毎
67
- sleep @wait['each']
68
-
69
- # 一定のリクエスト回数毎
70
- if @consec_count >= @wait['consec_count'] then
71
- sleep @wait['consec_wait']
72
- @consec_count = 0
127
+
128
+ class XmlConnector < Connector
129
+ def get (host, entity)
130
+ response = nil
131
+
132
+ begin
133
+ puts "Request to " + host + entity
134
+ Net::HTTP.start(host, 80) { |http|
135
+ response = http.get(entity)
136
+ }
137
+
138
+ rescue => e
139
+ puts e
140
+ rescue Timeout::Error => e
141
+ timeOut
142
+
143
+ else
144
+ res = case response
145
+ when Net::HTTPSuccess
146
+ reviewRes( response.body.force_encoding("UTF-8") )
147
+ # return response.body.force_encoding("UTF-8")
148
+ # when Net::HTTPRedirection
149
+ # fetch(response['location'], limit - 1)
150
+ when Net::HTTPForbidden
151
+ forbidden
152
+ when Net::HTTPNotFound
153
+ notFound
154
+ when Net::HTTPServiceUnavailable
155
+ serviceUnavailable
156
+ else
157
+ unknownError
158
+ end
159
+ end until res["order"] == "success" ||
160
+ res["order"] == "terminate"
161
+
162
+ res
163
+ end
164
+ end
165
+
166
+ class MylistAtomConnector < XmlConnector
167
+ private
168
+
169
+ def forbidden
170
+ # マイリストが非公開の場合、html/Atomのどちらへのリクエストであっても、403が返ってくる。
171
+ notPublic
172
+ end
173
+
174
+ def reviewRes(resBody)
175
+ if # アクセス集中時
176
+ /大変ご迷惑をおかけいたしますが、しばらく時間をあけてから再度検索いただくようご協力をお願いいたします。/ =~
177
+ resBody.force_encoding("UTF-8")
178
+ then
179
+ serverIsBusy
180
+ else
181
+ success(resBody)
182
+ end
183
+ end
184
+ end
185
+
186
+ class SearchByTagAtomConnector < XmlConnector
187
+ private
188
+
189
+ def forbidden
190
+ # マイリストが非公開の場合、html/Atomのどちらへのリクエストであっても、403が返ってくる。
191
+ notPublic
192
+ end
193
+
194
+ def reviewRes(resBody)
195
+ if # アクセス集中時
196
+ /大変ご迷惑をおかけいたしますが、しばらく時間をあけてから再度検索いただくようご協力をお願いいたします。/ =~
197
+ resBody.force_encoding("UTF-8")
198
+ then
199
+ serverIsBusy
200
+ else
201
+ success(resBody)
202
+ end
203
+ end
204
+ end
205
+
206
+ class GetThumbInfoConnector < XmlConnector
207
+ private
208
+
209
+ def reviewRes(resBody)
210
+ r = resBody.force_encoding("UTF-8")
211
+
212
+ if # getThumbInfoは、該当する動画がない・削除済み・コミュニティ限定でも200が返ってくる。
213
+ /<nicovideo_thumb_response\sstatus=\"fail\">/ =~ r
214
+ if /<code>NOT_FOUND<\/code>/ =~ r
215
+ notFound
216
+ elsif /<code>DELETED<\/code>/ =~ r
217
+ deleted
218
+ elsif /<code>COMMUNITY<\/code>/ =~ r
219
+ limInCommunity
220
+ else
221
+ serverIsBusy
73
222
  end
74
- end
75
- # ------------------------------------------------
223
+ else
224
+ success(resBody)
225
+ end
76
226
  end
227
+ end
77
228
 
78
- def timeOut
79
- sleep @wait['timeout']
80
- @connection = false
81
- @failed += 1
82
- warn "Timeout"
229
+ class HtmlConnector < Connector
230
+ def initialize(mode)
231
+ @mode = mode
232
+ # デフォルトのウェイト設定
233
+ @waitConfig = {
234
+ 'consec_count' => 10, # 連続してリクエストする回数
235
+ 'consec_wait' => 10, # 連続リクエスト後のウェイト
236
+ 'each' => 10, # 連続リクエスト時の、1リクエスト毎のウェイト
237
+
238
+ '200-abnormal' => 300, # アクセス拒絶時(「短時間での連続アクセスは・・・」)の場合の再試行までの時間
239
+ 'unavailable' => 10,
240
+ '403' => 300, # "403"時の再試行までのウェイト
241
+ '404' => 300, # "403"時の再試行までのウェイト
242
+ 'increment' => 1, # アクセス拒絶時の、次回以降の1リクエスト毎のウェイトの増加量
243
+
244
+ 'timeout' => 10, # タイムアウト時の、再試行までのウェイト
245
+ '500' => 10, # "500"時の再試行までのウェイト
246
+ '503' => 10, # "503"時の再試行までのウェイト
247
+
248
+ 'retryLimit' => 3 # 再試行回数の限度
249
+ }
250
+
251
+ # 1つの検索結果画面に表示される動画の数。現時点では32個がデフォルトの模様。
252
+ @NumOfSearched = 32
253
+
254
+ @mech = Mechanize.new
255
+ # メモリ節約のため、Mechanizeの履歴機能を切る。
256
+ @mech.max_history = 1
257
+
258
+ @consec_count = 0
83
259
  end
260
+
261
+ public
84
262
 
85
263
  def errorStatus(ex)
86
264
  # 再試行回数が
@@ -178,92 +356,5 @@ class Connector
178
356
  return @mech.page
179
357
  end
180
358
 
181
- def xmlGet (host, entity)
182
- response = nil
183
- xmlDoc = nil
184
- retryCount = 0
185
- terminate = false
186
-
187
- begin
188
- puts "Requesting to " + host + entity
189
- Net::HTTP.start(host, 80) { |http|
190
- response = http.get(entity)
191
- }
192
- rescue => e
193
- puts e
194
- rescue Timeout::Error => e
195
- puts e
196
- puts "Timeout."
197
- # マイリスト非公開のときに、403になる。後で専用の処理を入れるべき。
198
- wait("timeout")
199
- retryCount += 1
200
-
201
- if retryCount >= @waitConfig["retryLimit"]
202
- terminate = true
203
- return "failed"
204
- end
205
- else
206
- case response
207
- when Net::HTTPSuccess
208
- unless abnormalRes(response.body)
209
- terminate = true
210
- return response.body.force_encoding("UTF-8")
211
- end
212
- wait("200-abnormal")
213
- retryCount += 1
214
- when Net::HTTPRedirection
215
- fetch(response['location'], limit - 1)
216
- when Net::HTTPForbidden
217
- puts "Access forbidden."
218
- # マイリスト非公開のときに、403になる。後で専用の処理を入れるべき。
219
- wait("403")
220
- retryCount += 1
221
- when Net::HTTPNotFound
222
- puts "Http not found."
223
- wait("404")
224
- retryCount += 1
225
- when Net::HTTPServiceUnavailable
226
- puts "Access rejected or service unavailable."
227
- wait("unavailable")
228
- retryCount += 1
229
- else
230
- puts response.force_encoding("UTF-8")
231
- puts "Unknown error."
232
- wait("other")
233
- retryCount += 1
234
- end
235
-
236
- if retryCount >= @waitConfig["retryLimit"]
237
- terminate = true
238
- return "failed"
239
- end
240
- end until terminate
241
- end
242
-
243
- def abnormalRes(resBody)
244
- if
245
- # mylistRss アクセス集中時
246
- /大変ご迷惑をおかけいたしますが、しばらく時間をあけてから再度検索いただくようご協力をお願いいたします。/ =~ resBody.force_encoding("UTF-8") ||
247
- # getThumbInfo失敗時
248
- /<nicovideo_thumb_response\sstatus=\"fail\">/ =~ resBody
249
- then
250
- puts "!!!!"
251
- true
252
- end
253
- end
254
-
255
- def wait(status)
256
- sleep @waitConfig[status.to_s]
257
- end
258
-
259
- def get (host, entity)
260
- case @mode
261
- when "html"
262
- mechGet(host + entity)
263
- when "atom"
264
- xmlGet(host, entity)
265
- end
266
- end
267
-
268
359
  attr_reader :mech
269
360
  end
data/lib/movie.rb CHANGED
@@ -1,12 +1,14 @@
1
1
  # -*- encoding: utf-8 -*-
2
+ $:.unshift File.dirname(__FILE__)
3
+
2
4
  require 'rubygems'
3
5
  require 'ruby-debug'
4
6
  require 'damerau-levenshtein'
5
7
  require 'kconv'
6
8
 
7
- require 'parser'
8
- require 'mylist'
9
- require 'connector'
9
+ require 'parser.rb'
10
+ require 'mylist.rb'
11
+ require 'connector.rb'
10
12
 
11
13
  class Movie
12
14
  def initialize(video_id)
@@ -19,9 +21,9 @@ class Movie
19
21
  public
20
22
 
21
23
  # 指定されたマイリストに自分が入っていれば、真を返す。
22
- def isBelongsTo (_mylistId, &block)
24
+ def isBelongsTo (mylistId, &block)
23
25
  isBelongs = false
24
- thisMl = Mylist.new(_mylistId)
26
+ thisMl = Mylist.new(mylistId)
25
27
  thisMl.getInfoLt
26
28
 
27
29
  thisMl.movies.each { |movie|
@@ -29,14 +31,14 @@ class Movie
29
31
  }
30
32
 
31
33
  if isBelongs
32
- puts "\sThis movie is found in mylist/" + _mylistId
34
+ puts "\sThis movie is found in mylist/" + mylistId.to_s
33
35
  else
34
- puts "\sThis movie is not found in mylist/" + _mylistId
36
+ puts "\sThis movie is not found in mylist/" + mylistId.to_s
35
37
  end
36
38
 
37
39
  # 無駄なアクセスを省くため、マイリスト中の動画に関する追加処理があれば、
38
40
  # ブロックとして実行できる。
39
- block.call(thisMl)
41
+ block.call(thisMl) if block != nil
40
42
 
41
43
  return isBelongs
42
44
  end
@@ -63,7 +65,7 @@ class Movie
63
65
 
64
66
  mylistIdAry.each { |_mylistId|
65
67
  belongsTo = isBelongsTo(_mylistId) { |mylistObj|
66
- similarity = mylistObj.isSeries
68
+ similarity = mylistObj.getSimilarity
67
69
  puts "\sSimilarity:\t" + similarity.to_s
68
70
  }
69
71
  puts belongsTo
@@ -103,22 +105,20 @@ class Movie
103
105
  end
104
106
 
105
107
  def getInfo
106
- con = Connector.new('xml')
108
+ con = GetThumbInfoConnector.new()
107
109
  host = 'ext.nicovideo.jp'
108
110
  entity = '/api/getthumbinfo/' + @video_id
109
111
  con.setWait(nil)
110
- xml = con.xmlGet(host, entity)
111
-
112
- unless
113
- xml =~ /<nicovideo_thumb_response\sstatus=\"fail\">/ ||
114
- xml == "failed"
112
+ result = con.get(host, entity)
113
+
114
+ if
115
+ result["order"] == "success"
115
116
  then
116
- param = NicoParser.getThumbInfo(xml)
117
- set(param)
117
+ parsed = NicoParser.getThumbInfo(result["body"])
118
+ set(parsed)
118
119
  @available = true
119
120
  else
120
121
  @available = false
121
- return "failed"
122
122
  end
123
123
  end
124
124
 
@@ -239,8 +239,8 @@ class Movie
239
239
  @thumb_type = param
240
240
  when "embeddable"
241
241
  @embeddable = param
242
- when "movieNum_live_play"
243
- @movieNum_live_play = param
242
+ when "no_live_play"
243
+ @no_live_play = param
244
244
  when "tags_jp"
245
245
  @tags_jp = param
246
246
  when "tags_tw"
@@ -290,7 +290,7 @@ class Movie
290
290
  attr_accessor :watch_url
291
291
  attr_accessor :thumb_type
292
292
  attr_accessor :embeddable
293
- attr_accessor :movieNum_live_play
293
+ attr_accessor :no_live_play
294
294
  attr_accessor :tags_jp
295
295
  attr_accessor :tags_tw
296
296
  attr_accessor :user_id
data/lib/mylist.rb CHANGED
@@ -1,11 +1,13 @@
1
1
  # -*- encoding: utf-8 -*-
2
+ $:.unshift File.dirname(__FILE__)
3
+
2
4
  require 'rubygems'
3
5
  require 'ruby-debug'
4
6
  require 'kconv'
5
7
 
6
- require 'parser'
7
- require 'movie'
8
- require 'connector'
8
+ require 'parser.rb'
9
+ require 'movie.rb'
10
+ require 'connector.rb'
9
11
 
10
12
 
11
13
  class Mylist
@@ -15,49 +17,9 @@ class Mylist
15
17
  @available = false
16
18
  end
17
19
 
18
- def simOfTitle
19
- match = false
20
- dl = DamerauLevenshtein
21
- d = 0.0
22
-
23
- puts "matching..."
24
-
25
- # O(n^2)なのでどうにかしたい。しかし、最大でも500C2=125000なので、
26
- # 日々の利用については許容できると思う、
27
- @movies.each { |myself|
28
- @movies.each { |amovieNumther|
29
- _d = dl.distance(myself.title, amovieNumther.title)
30
- d += _d
31
- }
32
- }
33
-
34
- similarity = 1 - ( (d / @movies.length) / title.length )
35
- puts "Similarity: " + similarity.to_s
36
- similarity
37
- end
38
-
39
- def userId
40
- idGroup = {}
41
- samePublisher = false
42
- threshold = 0.9
43
-
44
- @movies.each { |movie|
45
- idGroup[movie.user_id] += 1
46
- }
47
-
48
- idGroup.each { |group|
49
- if @movies.length / threshold < group.length
50
- samePublicher = true
51
- end
52
- }
53
-
54
- return samePublisher
55
- end
56
-
57
- # 自分がシリーズをまとめたマイリストであるかを判定する。
58
- # 判定基準は、1.一定数以上の動画の投稿者が、マイリスト作成者と同じであること。
59
- # 2.タイトルの類似度が、定められた基準以上であること。
60
- def isSeries
20
+ # 自分に含まれている動画のタイトルをすべての組み合わせにおいて比較し、
21
+ # 類似度の平均を返す。
22
+ def getSimilarity
61
23
  l = @movies.length - 1
62
24
  dlc = DamerauLevenshtein
63
25
  dl = 0.0
@@ -180,17 +142,17 @@ class Mylist
180
142
  end
181
143
 
182
144
  def getInfoLt
183
- con = Connector.new('xml')
145
+ con = MylistAtomConnector.new()
184
146
  host = 'www.nicovideo.jp'
185
147
  puts @mylist_id
186
148
  entity = '/mylist/' + @mylist_id.to_s + '?rss=atom&numbers=1'
187
149
  con.setWait(nil)
188
- xml = con.xmlGet(host, entity)
189
-
190
- unless
191
- xml == "failed"
192
- then
193
- parsed = NicoParser.mylistRss(xml)
150
+ result = con.get(host, entity)
151
+
152
+ if
153
+ result["order"] == "success"
154
+ then
155
+ parsed = NicoParser.mylistRss(result["body"])
194
156
 
195
157
  parsed["entry"].each { |e|
196
158
  movie = Movie.new(e["video_id"])
@@ -201,7 +163,9 @@ class Mylist
201
163
 
202
164
  set(parsed["mylist"])
203
165
  @available = true
204
- end
166
+ else
167
+ @available = false
168
+ end
205
169
  end
206
170
 
207
171
  def set(paramObj)
@@ -252,7 +216,8 @@ class Mylist
252
216
  attr_accessor :create_time
253
217
  attr_accessor :update_time
254
218
  attr_accessor :icon_id
255
- attr_accessor :sort_order
219
+ attr_accessor :sort_order
220
+ attr_accessor :author
256
221
 
257
222
  attr_accessor :movies
258
223
  end
data/lib/nicoscraper.rb CHANGED
@@ -3,4 +3,4 @@ require 'ruby-debug'
3
3
 
4
4
  require 'movie'
5
5
  require 'mylist'
6
- require 'getmovie'
6
+ require 'searcher'
data/lib/parser.rb CHANGED
@@ -1,8 +1,10 @@
1
1
  # -*- encoding: utf-8 -*-
2
+ $:.unshift File.dirname(__FILE__)
3
+
2
4
  require 'rubygems'
3
5
  require 'xml'
4
6
  require 'time'
5
- require 'converter'
7
+ require 'converter.rb'
6
8
 
7
9
  module NicoParser
8
10
  public
@@ -152,7 +154,7 @@ module NicoParser
152
154
  end
153
155
  when "subtitle"
154
156
  doc.read
155
- parsed["entry"][n]["description"] = doc.value
157
+ parsed["mylist"]["description"] = doc.value
156
158
  when "id"
157
159
  if n == -1
158
160
  doc.read
@@ -161,7 +163,7 @@ module NicoParser
161
163
  else
162
164
  doc.read
163
165
  parsed["entry"][n]["item_id"] =
164
- Extract.itemId(doc.value)
166
+ Extract.itemId(doc.value)
165
167
  end
166
168
  when "updated"
167
169
  doc.read
@@ -176,54 +178,30 @@ module NicoParser
176
178
  when "content"
177
179
  doc.read
178
180
  html = doc.value
179
-
180
- memo =
181
- html.slice(
182
- /<p\sclass\=\"nico-memo\"\>[^\<]{1,}/
183
- ).to_s.slice(21, 999)
184
-
181
+
182
+ /(<p\sclass=\"nico-memo\"\>)([^\<]{1,})/ =~ html
183
+ memo = $2
184
+
185
185
  /(<p\sclass=\"nico-thumbnail\">.+src=\")(http:\/\/[^\"]{1,})/ =~ html
186
186
  thumbnail_url = $2
187
-
188
- description =
189
- html.slice(
190
- /<p\sclass\=\"nico-description\"\>[^\<]{1,}/
191
- ).to_s.slice(31, 999)
192
-
193
- length =
194
- Convert.toSeconds(
195
- html.slice(
196
- /<strong\sclass\=\"nico-info-length\"\>[^\<]{1,}/
197
- ).to_s.slice(33, 999)
198
- )
199
-
200
- first_retrieve =
201
- Convert.japToUnix(
202
- html.slice(
203
- /<strong\sclass\=\"nico-info-date\"\>[^\<]{1,}/
204
- ).to_s.slice(31, 999)
205
- )
206
-
207
- view =
208
- Convert.commaRemover(
209
- html.slice(
210
- /<strong\sclass\=\"nico-numbers-view\"\>[^\<]{1,}/
211
- ).to_s.slice(34, 999)
212
- )
213
-
214
- res =
215
- Convert.commaRemover(
216
- html.slice(
217
- /<strong\sclass\=\"nico-numbers-res\"\>[^\<]{1,}/
218
- ).to_s.slice(33, 999)
219
- )
220
-
221
- mylist =
222
- Convert.commaRemover(
223
- html.slice(
224
- /<strong\sclass\=\"nico-numbers-mylist\"\>[^\<]{1,}/
225
- ).to_s.slice(36, 999)
226
- )
187
+
188
+ /(<p\sclass\=\"nico-description\"\>)([^\<]{1,})/ =~ html
189
+ description = $2
190
+
191
+ /(<p\sclass\=\"nico-info-length\"\>)([^\<]{1,})/ =~ html
192
+ length = $2
193
+
194
+ /(<p\sclass\=\"nico-info-date\"\>)([^\<]{1,})/ =~ html
195
+ first_retrieve = $2
196
+
197
+ /(<p\sclass\=\"nico-numbers-view\"\>)([^\<]{1,})/ =~ html
198
+ view = $2
199
+
200
+ /(<p\sclass\=\"nico-numbers-res\"\>)([^\<]{1,})/ =~ html
201
+ res = $2
202
+
203
+ /(<p\sclass\=\"nico-numbers-mylist\"\>)([^\<]{1,})/ =~ html
204
+ mylist = $2
227
205
 
228
206
  parsed["entry"][n]["memo"] = memo
229
207
  parsed["entry"][n]["thumbnail_url"] = thumbnail_url
@@ -238,6 +216,7 @@ module NicoParser
238
216
  end
239
217
 
240
218
  doc.close
219
+
241
220
  parsed
242
221
  end
243
222
 
data/lib/searcher.rb CHANGED
@@ -1,4 +1,6 @@
1
- # -*- encoding: utf-8 -*-
1
+ # -*- encoding: utf-8 -*-# -*- encoding: utf-8 -*-
2
+ $:.unshift File.dirname(__FILE__)
3
+
2
4
  require 'rubygems'
3
5
  require 'ruby-debug'
4
6
 
@@ -6,65 +8,13 @@ require 'time'
6
8
  require 'mechanize'
7
9
  require 'kconv'
8
10
 
9
- require 'parser'
10
-
11
-
12
- $wait_byTag = {
13
- 'consec_count' => 10, # 連続してリクエストする回数
14
- 'consec_wait' => 10, # 連続リクエスト後のウェイト
15
- 'each' => 10, # 連続リクエスト時の、1リクエスト毎のウェイト
16
-
17
- 'rejected' => 120, # アクセス拒絶時(「短時間での連続アクセスは・・・」)
18
- # の場合の再試行までの時間
19
- '403' => 600, # "403"時の再試行までのウェイト
20
- 'increment' => 1, # アクセス拒絶時の、次回以降の1リクエスト毎のウェイトの増加量
21
-
22
- 'timeout' => 5, # タイムアウト時の、再試行までのウェイト
23
- '500' => 600, # "500"時の再試行までのウェイト
24
- '503' => 600, # "503"時の再試行までのウェイト
25
-
26
- 'allowance_time'=> 5 # 再試行回数の限度
27
- }
28
-
29
- $wait_byMylistLt = {
30
- 'consec_count' => 10,
31
- 'consec_wait' => 10,
32
- 'each' => 10,
11
+ require 'parser.rb'
33
12
 
34
- 'rejected' => 120,
35
- '403' => 600,
36
- 'increment' => 1,
37
- 'timeout' => 5,
38
- '500' => 600,
39
- '503' => 600,
40
- 'allowance_time'=> 5
41
- }
42
13
 
43
- module GetMovie
44
- public
45
-
46
- def byTag (tag, sort, waitObj, &block)
47
- gMByTag = GetMovieByTag.new()
48
- gMByTag.execute(tag, sort, waitObj) { |result, page|
49
- block.call(result, page)
50
- }
51
- end
52
-
53
- def byTagLt (tag, sort, waitObj, &block)
54
- gMByTagLt = GetMovieByTagLt.new()
55
- gMByTagLt.execute(tag, sort, waitObj) { |result, page|
56
- block.call(result, page)
57
- }
58
- end
59
-
60
- module_function :byTag
61
- module_function :byTagLt
62
- end
63
-
64
- class GetMovieByTagSuper
14
+ class SearchByTagSuper
65
15
  private
66
16
 
67
- def get (tag, sort, page, method, waitObj)
17
+ def get(tag, sort, page, method, waitObj)
68
18
  paramAry = []
69
19
 
70
20
  case sort
@@ -94,7 +44,7 @@ class GetMovieByTagSuper
94
44
  sortStr = 'sort=l&order=a'
95
45
  end
96
46
 
97
- if page != 1 then paramAry.push("page=#{page}"); end
47
+ paramAry.push("page=#{page}") if page != 1
98
48
  paramAry.push(sortStr)
99
49
  if method == "atom" then paramAry.push("rss=atom&numbers=1") end
100
50
  param = tag + "?" + paramAry.join('&')
@@ -102,18 +52,16 @@ class GetMovieByTagSuper
102
52
  host = 'www.nicovideo.jp'
103
53
  entity = '/tag/' + param
104
54
 
105
- @con.setWait(waitObj)
106
- @con.get(host, entity)
55
+ @connector.setWait(waitObj)
56
+ @connector.get(host, entity)
107
57
  end
108
58
 
109
- public
110
-
111
- def loop (tag, sort, method, waitObj, &block)
112
- termFlag = false
113
- page = 1
59
+ def loop(tag, sort, method, waitObj, &block)
60
+ termFlag = false
61
+ page = 1
62
+ movieObjAry = []
114
63
 
115
64
  begin
116
- result = []
117
65
  response = get(
118
66
  tag,
119
67
  sort,
@@ -122,9 +70,16 @@ class GetMovieByTagSuper
122
70
  waitObj
123
71
  )
124
72
 
125
- if response
126
- result = parse(response)
127
- termFlag = block.call(result, page)
73
+ if response["order"] == "success"
74
+ result = parse(response["body"])
75
+ result.each { |each|
76
+ movie = Movie.new(each["video_id"])
77
+ each["available"] = true
78
+ movie.set(each)
79
+ movieObjAry.push(movie)
80
+ }
81
+
82
+ termFlag = block.call(movieObjAry, page)
128
83
  else
129
84
  termFlag = true
130
85
  end
@@ -134,13 +89,12 @@ class GetMovieByTagSuper
134
89
  end
135
90
  end
136
91
 
137
-
138
- class GetMovieByTag < GetMovieByTagSuper
92
+ class SearchByTag < SearchByTagSuper
139
93
  def initialize
140
- @NumOfSearched = 32
94
+ @numOfSearched = 32
141
95
  @incrAmt = 0.2
142
96
 
143
- @con = Connector.new('mech')
97
+ @connector = Connector.new('mech')
144
98
 
145
99
  # HTML中の各パラメータの所在を示すXPath
146
100
  @videoIdXP = "//div[@class='uad_thumbfrm']/table/tr/td/p/a"
@@ -154,16 +108,16 @@ class GetMovieByTag < GetMovieByTagSuper
154
108
  def parse(movieNum)
155
109
  result = []
156
110
 
157
- video_id = /(sm|nm)[0-9]{1,}/.match(@con.mech.page.search(@videoIdXP)[movieNum]['href'])[0]
158
- lengthStr = @con.mech.page.search(@lengthXP)[movieNum].text.split(/\:/)
111
+ video_id = /(sm|nm)[0-9]{1,}/.match(@connector.mech.page.search(@videoIdXP)[movieNum]['href'])[0]
112
+ lengthStr = @connector.mech.page.search(@lengthXP)[movieNum].text.split(/\:/)
159
113
  length = lengthStr[0].to_i * 60 + lengthStr[1].to_i
160
- view = @con.mech.page.search(@viewXP)[movieNum]
114
+ view = @connector.mech.page.search(@viewXP)[movieNum]
161
115
  .text.gsub(/\,/, '').to_i
162
- res = @con.mech.page.search(@resXP)[movieNum]
116
+ res = @connector.mech.page.search(@resXP)[movieNum]
163
117
  .text.gsub(/\,/, '').to_i
164
- mylist = @con.mech.page.search(@mylistXP)[movieNum]
118
+ mylist = @connector.mech.page.search(@mylistXP)[movieNum]
165
119
  .text.gsub(/\,/, '').to_i
166
- ad = @con.mech.page.search(@adXP)[movieNum]
120
+ ad = @connector.mech.page.search(@adXP)[movieNum]
167
121
  .text.gsub(/\,/, '').to_i
168
122
 
169
123
  result.push({
@@ -183,11 +137,11 @@ class GetMovieByTag < GetMovieByTagSuper
183
137
  end
184
138
  end
185
139
 
186
- class GetMovieByTagLt < GetMovieByTagSuper
140
+ class SearchByTagLt < SearchByTagSuper
187
141
  def initialize
188
- @NumOfSearched = 32
142
+ @numOfSearched = 32
189
143
  @incrAmt = 0.2
190
- @con = Connector.new('atom')
144
+ @connector = SearchByTagAtomConnector.new()
191
145
  end
192
146
 
193
147
  def parse(xml)
@@ -199,7 +153,4 @@ class GetMovieByTagLt < GetMovieByTagSuper
199
153
  block.call(result, page)
200
154
  }
201
155
  end
202
- end
203
-
204
-
205
-
156
+ end
data/nicoscraper.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{nicoscraper}
8
- s.version = "0.1.0"
8
+ s.version = "0.2.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = [%q{Masami Yonehara}]
12
- s.date = %q{2011-09-21}
12
+ s.date = %q{2011-09-22}
13
13
  s.description = %q{It scrape movies and mylists of Niconico douga.
14
14
  }
15
15
  s.email = %q{zeitdiebe@gmail.com}
@@ -34,6 +34,7 @@ Gem::Specification.new do |s|
34
34
  "lib/searcher.rb",
35
35
  "nicoscraper.gemspec",
36
36
  "test/helper.rb",
37
+ "test/movie_spec.rb",
37
38
  "test/test_nicoscraper.rb"
38
39
  ]
39
40
  s.homepage = %q{http://github.com/hdemon/nicoscraper}
@@ -0,0 +1,125 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.unshift File.dirname(__FILE__) + "/../lib"
3
+
4
+ require 'movie.rb'
5
+ require 'mylist.rb'
6
+ require 'searcher.rb'
7
+
8
+ describe Movie, "After executiton of 'getInfo' method" do
9
+ before(:all) do
10
+ @movie = Movie.new("sm1097445")
11
+ @movie.getInfo
12
+ end
13
+
14
+ it "should have following values" do
15
+ @movie.available .should be_true
16
+
17
+ @movie.video_id .should == "sm1097445"
18
+ @movie.title .should == "【初音ミク】みくみくにしてあげる♪【してやんよ】"
19
+ @movie.description .should_not be_nil
20
+ @movie.thumbnail_url.should_not be_nil
21
+ @movie.first_retrieve.should == 1190218922
22
+ @movie.length .should == 99
23
+ @movie.movie_type .should == "flv"
24
+ @movie.size_high .should == 3906547
25
+ @movie.size_low .should == 1688098
26
+
27
+ @movie.view_counter .should_not be_nil
28
+ @movie.comment_num .should_not be_nil
29
+ @movie.mylist_counter .should_not be_nil
30
+ @movie.last_res_body.should_not be_nil
31
+
32
+ @movie.watch_url .should == "http://www.nicovideo.jp/watch/sm1097445"
33
+ @movie.thumb_type .should == "video"
34
+ @movie.embeddable .should == 1
35
+ @movie.no_live_play .should == 0
36
+ @movie.tags_jp .should_not be_nil
37
+ @movie.tags_tw .should_not be_nil
38
+ @movie.user_id .should == 70391
39
+ end
40
+
41
+ it "should return true when execute 'isBelongsTo' method." +
42
+ "passing arguments (1450136)" do
43
+ result = @movie.isBelongsTo(1450136)
44
+ result .should_not be_nil
45
+ result .should be_true
46
+ end
47
+ end
48
+
49
+ describe Movie, "when access with non-existent video_id" do
50
+ before(:all) do
51
+ @movie = Movie.new("sm*nonexistent")
52
+ @movie.getInfo
53
+ end
54
+
55
+ it "should be unavailable" do
56
+ @movie.available .should be_false
57
+ end
58
+ end
59
+
60
+ describe Mylist, "After executiton of 'getInfoLt' method" do
61
+ before(:all) do
62
+ @mylist = Mylist.new(15196568)
63
+ @mylist.getInfoLt
64
+ puts @mylist
65
+ end
66
+
67
+ it "should have following values" do
68
+ @mylist.available .should be_true
69
+
70
+ @mylist.mylist_id .should == 15196568
71
+ @mylist.user_id .should be_nil
72
+ @mylist.title .should == "【Oblivion】おっさんの大冒険"
73
+ @mylist.description .should_not be_nil
74
+ @mylist.public .should be_nil
75
+ @mylist.default_sort .should be_nil
76
+ @mylist.create_time .should be_nil
77
+ @mylist.update_time .should_not be_nil
78
+
79
+ @mylist.icon_id .should be_nil
80
+ @mylist.movies .should_not be_nil
81
+ @mylist.movies .should be_kind_of(Array)
82
+ @mylist.author .should == "おぽこ"
83
+ end
84
+
85
+ it "should return over 0.9 when execute 'getSimilarity' method." +
86
+ "passing arguments (1450136)" do
87
+ result = @mylist.getSimilarity
88
+ result .should_not be_nil
89
+ result .should >= 0.9
90
+ end
91
+ end
92
+
93
+ describe "When execute 'SearchByTagLt.execute' method " +
94
+ "passing following argument" do
95
+ before(:all) do
96
+ searcher = SearchByTagLt.new()
97
+ searcher.execute("ゆっくり実況プレイpart1リンク", "post_old", nil) { |result|
98
+ @result = result
99
+ }
100
+ end
101
+
102
+ it "should have Array of movie objects." do
103
+ @result .should be_kind_of(Array)
104
+ @result[0].should be_instance_of(Movie)
105
+ end
106
+
107
+ it "should contains movie objects that have following structure." do
108
+ @result[0].available .should be_true
109
+
110
+ @result[0].video_id .should_not be_nil
111
+ @result[0].title .should_not be_nil
112
+ @result[0].published .should_not be_nil
113
+ @result[0].updated .should_not be_nil
114
+ #@result[0].memo .should_not be_nil
115
+ @result[0].description .should_not be_nil
116
+ @result[0].thumbnail_url.should_not be_nil
117
+ @result[0].published .should_not be_nil
118
+ @result[0].updated .should_not be_nil
119
+ @result[0].length .should_not be_nil
120
+
121
+ @result[0].view_counter .should_not be_nil
122
+ @result[0].comment_num .should_not be_nil
123
+ @result[0].mylist_counter.should_not be_nil
124
+ end
125
+ end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: nicoscraper
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.1.0
5
+ version: 0.2.0
6
6
  platform: ruby
7
7
  authors:
8
8
  - Masami Yonehara
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-09-21 00:00:00 Z
13
+ date: 2011-09-22 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: damerau-levenshtein
@@ -104,6 +104,7 @@ files:
104
104
  - lib/searcher.rb
105
105
  - nicoscraper.gemspec
106
106
  - test/helper.rb
107
+ - test/movie_spec.rb
107
108
  - test/test_nicoscraper.rb
108
109
  homepage: http://github.com/hdemon/nicoscraper
109
110
  licenses:
@@ -118,7 +119,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
118
119
  requirements:
119
120
  - - ">="
120
121
  - !ruby/object:Gem::Version
121
- hash: 4166593295240346995
122
+ hash: -4040185051160948255
122
123
  segments:
123
124
  - 0
124
125
  version: "0"