ruby-youtube-dl 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in ruby-youtube-dl.gemspec
4
+ gemspec
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,3 @@
1
#!/usr/bin/env ruby

# Thin wrapper: run the bundled youtube-dl.py that lives next to this
# file, forwarding all command-line arguments, and echo its output.
script_dir = File.expand_path(File.dirname(__FILE__))
command = "#{script_dir}/youtube-dl.py #{ARGV.join(" ")}"
puts `#{command}`
data/bin/youtube-dl.py ADDED
@@ -0,0 +1,4055 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ __author__ = (
5
+ 'Ricardo Garcia Gonzalez',
6
+ 'Danny Colligan',
7
+ 'Benjamin Johnson',
8
+ 'Vasyl\' Vavrychuk',
9
+ 'Witold Baryluk',
10
+ 'Paweł Paprota',
11
+ 'Gergely Imreh',
12
+ 'Rogério Brito',
13
+ 'Philipp Hagemeister',
14
+ 'Sören Schulze',
15
+ )
16
+
17
+ __license__ = 'Public Domain'
18
+ __version__ = '2011.10.19'
19
+
20
+ UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
+
22
+ import cookielib
23
+ import datetime
24
+ import gzip
25
+ import htmlentitydefs
26
+ import HTMLParser
27
+ import httplib
28
+ import locale
29
+ import math
30
+ import netrc
31
+ import os
32
+ import os.path
33
+ import re
34
+ import socket
35
+ import string
36
+ import subprocess
37
+ import sys
38
+ import time
39
+ import urllib
40
+ import urllib2
41
+ import warnings
42
+ import zlib
43
+
44
+ if os.name == 'nt':
45
+ import ctypes
46
+
47
+ try:
48
+ import email.utils
49
+ except ImportError: # Python 2.4
50
+ import email.Utils
51
+ try:
52
+ import cStringIO as StringIO
53
+ except ImportError:
54
+ import StringIO
55
+
56
+ # parse_qs was moved from the cgi module to the urlparse module recently.
57
+ try:
58
+ from urlparse import parse_qs
59
+ except ImportError:
60
+ from cgi import parse_qs
61
+
62
+ try:
63
+ import lxml.etree
64
+ except ImportError:
65
+ pass # Handled below
66
+
67
+ try:
68
+ import xml.etree.ElementTree
69
+ except ImportError: # Python<2.5: Not officially supported, but let it slip
70
+ warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
+
72
# Default HTTP headers sent with every request; they mimic a desktop
# Firefox browser so sites serve the same pages a real user would get.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

# ASCII letters and digits as a unicode string (Python 2 str.decode);
# the set of characters considered safe for "simple" titles.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
+
82
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	# Minimal pure-Python JSON decoder used as a stand-in for the stdlib
	# json module; only json.loads() is provided.
	import re
	class json(object):
		@staticmethod
		def loads(s):
			# The input is UTF-8 bytes; decode once and parse the unicode text.
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance past JSON whitespace; optionally fail at end of input.
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				# Translate one backslash escape captured by the rexp below.
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						# Plain \uXXXX escape.
						return unichr(int(esc[1:5], 16))
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						# Surrogate pair \uD8xx\uDCxx -> single code point.
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				i += 1
				e = i
				# Find the closing quote, skipping over escaped quotes
				# (an odd number of preceding backslashes means escaped).
				while True:
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				# Repeatedly read "key": value pairs separated by commas.
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				# The three keyword literals.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				# A fraction or exponent makes it a float; otherwise int.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch on the first character of a value; anything else is
			# assumed to start a number.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
194
+
195
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	def yield_preferredencoding():
		try:
			pref = locale.getpreferredencoding()
			# Sanity-check that the reported codec actually works.
			u'TEST'.encode(pref)
		except:
			# Unknown or unusable locale encoding: fall back to UTF-8.
			pref = 'UTF-8'
		while True:
			yield pref
	# The generator computes the value once; .next() pulls it out.
	return yield_preferredencoding().next()
210
+
211
+
212
+ def htmlentity_transform(matchobj):
213
+ """Transforms an HTML entity to a Unicode character.
214
+
215
+ This function receives a match object and is intended to be used with
216
+ the re.sub() function.
217
+ """
218
+ entity = matchobj.group(1)
219
+
220
+ # Known non-numeric HTML entity
221
+ if entity in htmlentitydefs.name2codepoint:
222
+ return unichr(htmlentitydefs.name2codepoint[entity])
223
+
224
+ # Unicode character
225
+ mobj = re.match(ur'(?u)#(x?\d+)', entity)
226
+ if mobj is not None:
227
+ numstr = mobj.group(1)
228
+ if numstr.startswith(u'x'):
229
+ base = 16
230
+ numstr = u'0%s' % numstr
231
+ else:
232
+ base = 10
233
+ return unichr(long(numstr, base))
234
+
235
+ # Unknown entity in name, return its literal representation
236
+ return (u'&%s;' % entity)
237
+
238
+
239
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# Decode HTML entities (&amp;, &#39;, ...) into real characters first.
	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
	# The path separator would split the filename into directories.
	return utitle.replace(unicode(os.sep), u'%')
243
+
244
+
245
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# '-' means standard output; on Windows switch stdout to
			# binary mode so video data is not mangled by newline
			# translation.
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
270
+
271
+
272
def timeconvert(timestr):
	"""Convert an RFC 2822 date string into a Unix timestamp.

	Returns None when the string cannot be parsed as a date.
	"""
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is None:
		# Unparseable input: mirror the "no result" contract.
		return None
	return email.utils.mktime_tz(timetuple)
279
+
280
+
281
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects when they are not configured to
	continue on errors; carries the relevant error message.
	"""
	pass
289
+
290
+
291
class SameFileError(Exception):
	"""Same File exception.

	Signals that two or more of the requested downloads would be written
	to one and the same file on disk.
	"""
	pass
298
+
299
+
300
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised from a PostProcessor's .run() method to signal that the
	postprocessing task failed.
	"""
	pass
307
+
308
+
309
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Signals that a video was requested in a format which is not offered
	for that particular video.
	"""
	pass
316
+
317
+
318
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a download delivers fewer
	bytes than the server originally announced, which usually means the
	connection was interrupted.
	"""
	# Byte counts describing the mismatch (both in bytes).
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.expected = expected
		self.downloaded = downloaded
332
+
333
+
334
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Raw deflate first (no zlib header); some servers send that even
		# though "deflate" formally means zlib-wrapped data.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Build an addinfourl carrying the HTTP status code; older Python
		# versions lack the 4-argument constructor, so set .code manually.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force the standard browser-like headers, replacing any existing
		# values for the same header names.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# The marker header opts this request out of compression; it must
		# not reach the server.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
391
+
392
+
393
+ class FileDownloader(object):
394
+ """File Downloader class.
395
+
396
+ File downloader objects are the ones responsible of downloading the
397
+ actual video file and writing it to disk if the user has requested
398
+ it, among some other tasks. In most cases there should be one per
399
+ program. As, given a video URL, the downloader doesn't know how to
400
+ extract all the needed information, task that InfoExtractors do, it
401
+ has to pass the URL to one of them.
402
+
403
+ For this, file downloader objects have a method that allows
404
+ InfoExtractors to be registered in a given order. When it is passed
405
+ a URL, the file downloader handles it to the first InfoExtractor it
406
+ finds that reports being able to handle it. The InfoExtractor extracts
407
+ all the information about the video or videos the URL refers to, and
408
+ asks the FileDownloader to process the video information, possibly
409
+ downloading the video.
410
+
411
+ File downloaders accept a lot of parameters. In order not to saturate
412
+ the object constructor with arguments, it receives a dictionary of
413
+ options instead. These options are available through the params
414
+ attribute for the InfoExtractors to use. The FileDownloader also
415
+ registers itself as the downloader in charge for the InfoExtractors
416
+ that are added to it, so this is a "mutual registration".
417
+
418
+ Available options:
419
+
420
+ username: Username for authentication purposes.
421
+ password: Password for authentication purposes.
422
+ usenetrc: Use netrc for authentication instead.
423
+ quiet: Do not print messages to stdout.
424
+ forceurl: Force printing final URL.
425
+ forcetitle: Force printing title.
426
+ forcethumbnail: Force printing thumbnail URL.
427
+ forcedescription: Force printing description.
428
+ forcefilename: Force printing final filename.
429
+ simulate: Do not download the video files.
430
+ format: Video format code.
431
+ format_limit: Highest quality format to try.
432
+ outtmpl: Template for output names.
433
+ ignoreerrors: Do not stop on download errors.
434
+ ratelimit: Download speed limit, in bytes/sec.
435
+ nooverwrites: Prevent overwriting files.
436
+ retries: Number of times to retry for HTTP error 5xx
437
+ continuedl: Try to continue downloads if possible.
438
+ noprogress: Do not print the progress bar.
439
+ playliststart: Playlist item to start at.
440
+ playlistend: Playlist item to end at.
441
+ matchtitle: Download only matching titles.
442
+ rejecttitle: Reject downloads for matching titles.
443
+ logtostderr: Log messages to stderr instead of stdout.
444
+ consoletitle: Display progress in console window's titlebar.
445
+ nopart: Do not use temporary .part files.
446
+ updatetime: Use the Last-modified header to set output file timestamps.
447
+ writedescription: Write the video description to a .description file
448
+ writeinfojson: Write the video description to a .info.json file
449
+ """
450
+
451
+ params = None
452
+ _ies = []
453
+ _pps = []
454
+ _download_retcode = None
455
+ _num_downloads = None
456
+ _screen_file = None
457
+
458
+ def __init__(self, params):
459
+ """Create a FileDownloader object with the given options."""
460
+ self._ies = []
461
+ self._pps = []
462
+ self._download_retcode = 0
463
+ self._num_downloads = 0
464
+ self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465
+ self.params = params
466
+
467
+ @staticmethod
468
+ def format_bytes(bytes):
469
+ if bytes is None:
470
+ return 'N/A'
471
+ if type(bytes) is str:
472
+ bytes = float(bytes)
473
+ if bytes == 0.0:
474
+ exponent = 0
475
+ else:
476
+ exponent = long(math.log(bytes, 1024.0))
477
+ suffix = 'bkMGTPEZY'[exponent]
478
+ converted = float(bytes) / float(1024 ** exponent)
479
+ return '%.2f%s' % (converted, suffix)
480
+
481
	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Format download progress as a right-aligned percentage string."""
		if data_len is None:
			# Total size unknown: show a placeholder instead of a number.
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate remaining download time as 'MM:SS' (or '--:--')."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			# No data yet (or interval below clock resolution): no estimate.
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			# More than 99 minutes does not fit the MM:SS display.
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Format the average download speed since `start` (e.g. '1.00Mb/s')."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Choose the next read size, adapting to the measured throughput."""
		# Allow the block size to at most halve or double per iteration.
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)
520
+
521
	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# The suffix position in 'bkmgtpezy' is the power of 1024 to apply
		# (an empty suffix matches 'b' at index 0, i.e. plain bytes).
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))
530
+
531
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# Mutual registration: the extractor gets a back-reference so it
		# can hand extracted information to this downloader.
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)
540
+
541
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
				self._screen_file.flush()
		except (UnicodeEncodeError), err:
			# The console may be unable to display some characters; only
			# propagate when the caller wants to hear about it.
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm-compatible escape sequence to set the window title.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

	def fixed_template(self):
		"""Checks if the output template is fixed."""
		# 'Fixed' means no %(field)s placeholders, so every download
		# would be written to one and the same file.
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
+
571
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Only reached in 'ignoreerrors' mode: remember that a failure
		# happened so the process can exit with a non-zero return code.
		self._download_retcode = 1
583
+
584
	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough for the average speed since
			# start_time to drop back to the configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
+
597
	def temp_name(self, filename):
		"""Returns a temporary filename for the given filename."""
		if self.params.get('nopart', False) or filename == u'-' or \
				(os.path.exists(filename) and not os.path.isfile(filename)):
			# Write directly when .part files are disabled, the target is
			# stdout, or the target exists but is not a regular file.
			return filename
		return filename + u'.part'

	def undo_temp_name(self, filename):
		# Strip the '.part' suffix added by temp_name(), if present.
		if filename.endswith(u'.part'):
			return filename[:-len(u'.part')]
		return filename

	def try_rename(self, old_filename, new_filename):
		# Move the finished temporary file to its final name; a failure
		# is reported through trouble() rather than raised directly.
		try:
			if old_filename == new_filename:
				return
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
616
+
617
+ def try_utime(self, filename, last_modified_hdr):
618
+ """Try to set the last-modified time of the given file."""
619
+ if last_modified_hdr is None:
620
+ return
621
+ if not os.path.isfile(filename):
622
+ return
623
+ timestr = last_modified_hdr
624
+ if timestr is None:
625
+ return
626
+ filetime = timeconvert(timestr)
627
+ if filetime is None:
628
+ return filetime
629
+ try:
630
+ os.utime(filename, (time.time(), filetime))
631
+ except:
632
+ pass
633
+ return filetime
634
+
635
	def report_writedescription(self, descfn):
		""" Report that the description file is being written """
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)

	def report_writeinfojson(self, infofn):
		""" Report that the metadata file has been written """
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# The leading '\r' rewrites the current line, producing an
		# in-place progress display.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a generic message when the filename cannot be
			# encoded for the console.
			self.to_screen(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_screen(u'[download] Download completed')
		else:
			# With the progress bar active, just terminate its line.
			self.to_screen(u'')

	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file."""
		self._num_downloads += 1
685
+
686
	def prepare_filename(self, info_dict):
		"""Generate the output filename."""
		try:
			template_dict = dict(info_dict)
			# Extra template fields available on top of the extractor's own.
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
			return filename
		except (ValueError, KeyError), err:
			# Bad placeholder syntax or unknown field in the template.
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return None
697
+
698
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		filename = self.prepare_filename(info_dict)

		# Forced printings
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			return

		if filename is None:
			return

		# Title-based filtering: skip the download when the title fails
		# the 'matchtitle' pattern or matches the 'rejecttitle' pattern.
		matchtitle=self.params.get('matchtitle',False)
		rejecttitle=self.params.get('rejecttitle',False)
		title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
			return
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
			return

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		# Make sure the target directory exists.
		try:
			dn = os.path.dirname(filename)
			if dn != '' and not os.path.exists(dn):
				os.makedirs(dn)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			return

		if self.params.get('writedescription', False):
			try:
				descfn = filename + '.description'
				self.report_writedescription(descfn)
				descfile = open(descfn, 'wb')
				try:
					descfile.write(info_dict['description'].encode('utf-8'))
				finally:
					descfile.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				return

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# Probe that a usable JSON encoder exists (the trivialjson
			# fallback has no dump()); the attribute access itself raises.
			try:
				json.dump
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				return
			try:
				infof = open(infofn, 'wb')
				try:
					# 'urlhandle' holds a live connection object and is
					# not serializable; drop it from the metadata.
					json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
					json.dump(json_info_dict, infof)
				finally:
					infof.close()
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				return

		if not self.params.get('skip_download', False):
			try:
				success = self._do_download(filename, info_dict)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				return
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				return

			if success:
				try:
					self.post_process(filename, info_dict)
				except (PostProcessingError), err:
					self.trouble(u'ERROR: postprocessing: %s' % str(err))
					return
795
+
796
	def download(self, url_list):
		"""Download a given list of URLs."""
		if len(url_list) > 1 and self.fixed_template():
			# A template with no placeholders would write every URL to
			# one and the same output file.
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode
821
+
822
+ def post_process(self, filename, ie_info):
823
+ """Run the postprocessing chain on the given file."""
824
+ info = dict(ie_info)
825
+ info['filepath'] = filename
826
+ for pp in self._pps:
827
+ info = pp.run(info)
828
+ if info is None:
829
+ break
830
+
831
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// stream by shelling out to rtmpdump."""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		# Keep resuming ('-e') while rtmpdump signals a resumable stop;
		# bail out of the loop when no progress is being made.
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				break
			# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				retval = 0
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
867
+
868
+ def _do_download(self, filename, info_dict):
869
+ url = info_dict['url']
870
+ player_url = info_dict.get('player_url', None)
871
+
872
+ # Check file already present
873
+ if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
874
+ self.report_file_already_downloaded(filename)
875
+ return True
876
+
877
+ # Attempt to download using rtmpdump
878
+ if url.startswith('rtmp'):
879
+ return self._download_with_rtmpdump(filename, url, player_url)
880
+
881
+ tmpfilename = self.temp_name(filename)
882
+ stream = None
883
+
884
+ # Do not include the Accept-Encoding header
885
+ headers = {'Youtubedl-no-compression': 'True'}
886
+ basic_request = urllib2.Request(url, None, headers)
887
+ request = urllib2.Request(url, None, headers)
888
+
889
+ # Establish possible resume length
890
+ if os.path.isfile(tmpfilename):
891
+ resume_len = os.path.getsize(tmpfilename)
892
+ else:
893
+ resume_len = 0
894
+
895
+ open_mode = 'wb'
896
+ if resume_len != 0:
897
+ if self.params.get('continuedl', False):
898
+ self.report_resuming_byte(resume_len)
899
+ request.add_header('Range','bytes=%d-' % resume_len)
900
+ open_mode = 'ab'
901
+ else:
902
+ resume_len = 0
903
+
904
+ count = 0
905
+ retries = self.params.get('retries', 0)
906
+ while count <= retries:
907
+ # Establish connection
908
+ try:
909
+ if count == 0 and 'urlhandle' in info_dict:
910
+ data = info_dict['urlhandle']
911
+ data = urllib2.urlopen(request)
912
+ break
913
+ except (urllib2.HTTPError, ), err:
914
+ if (err.code < 500 or err.code >= 600) and err.code != 416:
915
+ # Unexpected HTTP error
916
+ raise
917
+ elif err.code == 416:
918
+ # Unable to resume (requested range not satisfiable)
919
+ try:
920
+ # Open the connection again without the range header
921
+ data = urllib2.urlopen(basic_request)
922
+ content_length = data.info()['Content-Length']
923
+ except (urllib2.HTTPError, ), err:
924
+ if err.code < 500 or err.code >= 600:
925
+ raise
926
+ else:
927
+ # Examine the reported length
928
+ if (content_length is not None and
929
+ (resume_len - 100 < long(content_length) < resume_len + 100)):
930
+ # The file had already been fully downloaded.
931
+ # Explanation to the above condition: in issue #175 it was revealed that
932
+ # YouTube sometimes adds or removes a few bytes from the end of the file,
933
+ # changing the file size slightly and causing problems for some users. So
934
+ # I decided to implement a suggested change and consider the file
935
+ # completely downloaded if the file size differs less than 100 bytes from
936
+ # the one in the hard drive.
937
+ self.report_file_already_downloaded(filename)
938
+ self.try_rename(tmpfilename, filename)
939
+ return True
940
+ else:
941
+ # The length does not match, we start the download over
942
+ self.report_unable_to_resume()
943
+ open_mode = 'wb'
944
+ break
945
+ # Retry
946
+ count += 1
947
+ if count <= retries:
948
+ self.report_retry(count, retries)
949
+
950
+ if count > retries:
951
+ self.trouble(u'ERROR: giving up after %s retries' % retries)
952
+ return False
953
+
954
+ data_len = data.info().get('Content-length', None)
955
+ if data_len is not None:
956
+ data_len = long(data_len) + resume_len
957
+ data_len_str = self.format_bytes(data_len)
958
+ byte_counter = 0 + resume_len
959
+ block_size = 1024
960
+ start = time.time()
961
+ while True:
962
+ # Download and write
963
+ before = time.time()
964
+ data_block = data.read(block_size)
965
+ after = time.time()
966
+ if len(data_block) == 0:
967
+ break
968
+ byte_counter += len(data_block)
969
+
970
+ # Open file just in time
971
+ if stream is None:
972
+ try:
973
+ (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
974
+ assert stream is not None
975
+ filename = self.undo_temp_name(tmpfilename)
976
+ self.report_destination(filename)
977
+ except (OSError, IOError), err:
978
+ self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
979
+ return False
980
+ try:
981
+ stream.write(data_block)
982
+ except (IOError, OSError), err:
983
+ self.trouble(u'\nERROR: unable to write data: %s' % str(err))
984
+ return False
985
+ block_size = self.best_block_size(after - before, len(data_block))
986
+
987
+ # Progress message
988
+ speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
989
+ if data_len is None:
990
+ self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
991
+ else:
992
+ percent_str = self.calc_percent(byte_counter, data_len)
993
+ eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
994
+ self.report_progress(percent_str, data_len_str, speed_str, eta_str)
995
+
996
+ # Apply rate limit
997
+ self.slow_down(start, byte_counter - resume_len)
998
+
999
+ if stream is None:
1000
+ self.trouble(u'\nERROR: Did not get any data blocks')
1001
+ return False
1002
+ stream.close()
1003
+ self.report_finish()
1004
+ if data_len is not None and byte_counter != data_len:
1005
+ raise ContentTooShortError(byte_counter, long(data_len))
1006
+ self.try_rename(tmpfilename, filename)
1007
+
1008
+ # Update file modification time
1009
+ if self.params.get('updatetime', True):
1010
+ info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1011
+
1012
+ return True
1013
+
1014
+
1015
class InfoExtractor(object):
    """Common base for all site-specific information extractors.

    An information extractor takes a URL and produces one or more
    dictionaries describing the video(s) behind it; each dictionary is
    handed to the FileDownloader, which may then fetch the video to the
    file system, among other possible outcomes. Mandatory fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader.
    title:          Literal title.
    stitle:         Simplified title.
    ext:            Video filename extension.
    format:         Video format.
    player_url:     SWF Player URL (may be None).

    Two optional fields exist mainly so youtube-dl can serve as the
    backend for a video search function (such as the one in youtube2mp3);
    they are consulted only by their respective forced-printing options:

    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.

    Concrete extractors override _real_initialize() and _real_extract()
    and define a _VALID_URL regular expression; normally they should
    also be registered in the list of extractors.
    """

    # Whether _real_initialize() has run, and the FileDownloader we report to.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def set_downloader(self, downloader):
        """Attach the FileDownloader used for output and reporting."""
        self._downloader = downloader

    def suitable(self, url):
        """Return True if this extractor can handle the given URL."""
        match = re.match(self._VALID_URL, url)
        return match is not None

    def initialize(self):
        """Perform one-time setup (authentication, etc.) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then extract information for url."""
        self.initialize()
        return self._real_extract(url)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
1083
+
1084
+
1085
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Handles plain watch pages, youtu.be short links, embed/v URLs and the
    -nocookie domain. Can optionally log in and confirm age during
    initialization (see _real_initialize).
    """

    # Group 2 captures the video id; the conditional pattern (?(1).+)?
    # only allows a trailing suffix when the scheme/host prefix matched.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    # Forces the site into English/US so the scraped markup is stable.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name looked up in the user's .netrc for credentials.
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # itag -> container file extension (anything missing falls back to 'flv').
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
    }
    # itag -> dimensions string shown by --list-formats ('???' = unknown).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
    }
    IE_NAME = u'youtube'

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _print_formats(self, formats):
        """Print one 'itag : extension [dimensions]' line per format."""
        print 'Available formats:'
        for x in formats:
            print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set site language and, if credentials exist, log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language (best-effort; a failure only emits a warning).
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present in the response, the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Scrape the watch page and get_video_info, pick format(s), and
        hand each selected (format, url) pair to the downloader."""
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL (needed for rtmpdump's -W);
        # the re.sub un-escapes the JSON backslash escaping.
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title: collapse any run of disallowed characters to '_'.
        # NOTE(review): simple_title_chars is defined elsewhere in this file.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:	# don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scrape the date string and try several textual
        # formats; if none parses, the raw string (or u'NA') is kept.
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description: prefer lxml when importable (the top-of-file import
        # may have failed, hence the NameError probe); otherwise fall back
        # to a regex on the meta tag, and only when actually requested.
        try:
            lxml.etree
        except NameError:
            video_description = u'No description available.'
            if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
                mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
                if mobj is not None:
                    video_description = mobj.group(1).decode('utf-8')
        else:
            html_parser = lxml.etree.HTMLParser(encoding='utf-8')
            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
            video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
            # TODO use another parser

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP streams expose a single connection URL and no itag.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> URL map from the comma-separated stream map.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            # Optionally cap quality at format_limit, then keep only
            # formats the server actually offers (in quality order).
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description,
                    'player_url': player_url,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
1402
+
1403
+
1404
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Initialization accepts the site's family-filter disclaimer so adult
    content is reachable; 'yt-' prefixed ids are delegated to YoutubeIE.
    """

    # Group 1 is the video id, group 2 the URL slug used as simple title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # YoutubeIE instance used for videos Metacafe merely mirrors from YouTube.
    _youtube_ie = None
    IE_NAME = u'metacafe'

    def __init__(self, youtube_ie, downloader=None):
        """Constructor. Requires a YoutubeIE for delegated extraction."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter opt-out."""
        # Retrieve disclaimer (also establishes any cookies the POST needs).
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Scrape the watch page for the media URL and metadata, then hand
        the result to the downloader."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate and stop.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage. Two page layouts
        # exist: a direct &mediaURL= parameter, or a flashvars blob whose
        # mediaData JSON carries the URL and access key.
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            # Extension taken from the URL's last three characters.
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            # Un-escape the JSON-escaped slashes.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1544
+
1545
+
1546
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    Disables the family filter via cookie and scrapes the SD stream URL
    from the player's 'sequence' flash variable.
    """

    # Group 1 is the video id, group 2 the URL slug used as simple title.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No site-wide setup needed for Dailymotion.
        return

    def _real_extract(self, url):
        """Scrape the watch page for the SD media URL and metadata, then
        hand the result to the downloader."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        simple_title = mobj.group(2).decode('utf-8')
        # Streams scraped this way are always delivered as FLV.
        video_extension = 'flv'

        # Retrieve video webpage to extract further information; the
        # cookie disables the family filter so adult videos resolve.
        request = urllib2.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage: first the
        # URL-encoded 'sequence' flashvar, then its sdURL field.
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1635
+
1636
+
1637
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com.

    Scrapes the videoplay page for a direct mp4 download URL, falling
    back to the \\x-escaped Flash (flv) URL embedded in the page's
    JavaScript when no download link is present.
    """

    # docid is captured as group(1); everything after it is ignored.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Google Video needs no login or other setup.
        return

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        # Optimistic default; downgraded to 'flv' in the fallback below.
        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No direct download link: fall back to the Flash stream URL,
            # which appears in the page JS with \x-escaped '=' and '&'.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the JavaScript hex escaping: '\x3d' -> '=', '\x26' -> '&'.
        # (No-op for the mp4 path, which contains no such escapes.)
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        # Build the "simple title" by collapsing disallowed characters to '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (only when explicitly requested: it
        # costs an extra HTTP round-trip to the search page).
        if self._downloader.params.get('forcethumbnail', False):
            # NOTE(review): abs(int(...)) suggests docids can be negative
            # and the search page wants the absolute value — TODO confirm.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                # 'webpage' is deliberately overwritten with the search page.
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:	# we need something to pass to process_info
            video_thumbnail = ''

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': u'NA',
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
1742
+
1743
+
1744
+ class PhotobucketIE(InfoExtractor):
1745
+ """Information extractor for photobucket.com."""
1746
+
1747
+ _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1748
+ IE_NAME = u'photobucket'
1749
+
1750
+ def __init__(self, downloader=None):
1751
+ InfoExtractor.__init__(self, downloader)
1752
+
1753
+ def report_download_webpage(self, video_id):
1754
+ """Report webpage download."""
1755
+ self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1756
+
1757
+ def report_extraction(self, video_id):
1758
+ """Report information extraction."""
1759
+ self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1760
+
1761
+ def _real_initialize(self):
1762
+ return
1763
+
1764
+ def _real_extract(self, url):
1765
+ # Extract id from URL
1766
+ mobj = re.match(self._VALID_URL, url)
1767
+ if mobj is None:
1768
+ self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1769
+ return
1770
+
1771
+ # At this point we have a new video
1772
+ self._downloader.increment_downloads()
1773
+ video_id = mobj.group(1)
1774
+
1775
+ video_extension = 'flv'
1776
+
1777
+ # Retrieve video webpage to extract further information
1778
+ request = urllib2.Request(url)
1779
+ try:
1780
+ self.report_download_webpage(video_id)
1781
+ webpage = urllib2.urlopen(request).read()
1782
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1783
+ self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1784
+ return
1785
+
1786
+ # Extract URL, uploader, and title from webpage
1787
+ self.report_extraction(video_id)
1788
+ mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1789
+ if mobj is None:
1790
+ self._downloader.trouble(u'ERROR: unable to extract media URL')
1791
+ return
1792
+ mediaURL = urllib.unquote(mobj.group(1))
1793
+
1794
+ video_url = mediaURL
1795
+
1796
+ mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1797
+ if mobj is None:
1798
+ self._downloader.trouble(u'ERROR: unable to extract title')
1799
+ return
1800
+ video_title = mobj.group(1).decode('utf-8')
1801
+ video_title = sanitize_title(video_title)
1802
+ simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1803
+
1804
+ video_uploader = mobj.group(2).decode('utf-8')
1805
+
1806
+ try:
1807
+ # Process video information
1808
+ self._downloader.process_info({
1809
+ 'id': video_id.decode('utf-8'),
1810
+ 'url': video_url.decode('utf-8'),
1811
+ 'uploader': video_uploader,
1812
+ 'upload_date': u'NA',
1813
+ 'title': video_title,
1814
+ 'stitle': simple_title,
1815
+ 'ext': video_extension.decode('utf-8'),
1816
+ 'format': u'NA',
1817
+ 'player_url': None,
1818
+ })
1819
+ except UnavailableVideoError:
1820
+ self._downloader.trouble(u'\nERROR: unable to download video')
1821
+
1822
+
1823
+ class YahooIE(InfoExtractor):
1824
+ """Information extractor for video.yahoo.com."""
1825
+
1826
+ # _VALID_URL matches all Yahoo! Video URLs
1827
+ # _VPAGE_URL matches only the extractable '/watch/' URLs
1828
+ _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1829
+ _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1830
+ IE_NAME = u'video.yahoo'
1831
+
1832
+ def __init__(self, downloader=None):
1833
+ InfoExtractor.__init__(self, downloader)
1834
+
1835
+ def report_download_webpage(self, video_id):
1836
+ """Report webpage download."""
1837
+ self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1838
+
1839
+ def report_extraction(self, video_id):
1840
+ """Report information extraction."""
1841
+ self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1842
+
1843
+ def _real_initialize(self):
1844
+ return
1845
+
1846
+ def _real_extract(self, url, new_video=True):
1847
+ # Extract ID from URL
1848
+ mobj = re.match(self._VALID_URL, url)
1849
+ if mobj is None:
1850
+ self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1851
+ return
1852
+
1853
+ # At this point we have a new video
1854
+ self._downloader.increment_downloads()
1855
+ video_id = mobj.group(2)
1856
+ video_extension = 'flv'
1857
+
1858
+ # Rewrite valid but non-extractable URLs as
1859
+ # extractable English language /watch/ URLs
1860
+ if re.match(self._VPAGE_URL, url) is None:
1861
+ request = urllib2.Request(url)
1862
+ try:
1863
+ webpage = urllib2.urlopen(request).read()
1864
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1865
+ self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1866
+ return
1867
+
1868
+ mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1869
+ if mobj is None:
1870
+ self._downloader.trouble(u'ERROR: Unable to extract id field')
1871
+ return
1872
+ yahoo_id = mobj.group(1)
1873
+
1874
+ mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1875
+ if mobj is None:
1876
+ self._downloader.trouble(u'ERROR: Unable to extract vid field')
1877
+ return
1878
+ yahoo_vid = mobj.group(1)
1879
+
1880
+ url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1881
+ return self._real_extract(url, new_video=False)
1882
+
1883
+ # Retrieve video webpage to extract further information
1884
+ request = urllib2.Request(url)
1885
+ try:
1886
+ self.report_download_webpage(video_id)
1887
+ webpage = urllib2.urlopen(request).read()
1888
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1889
+ self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1890
+ return
1891
+
1892
+ # Extract uploader and title from webpage
1893
+ self.report_extraction(video_id)
1894
+ mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1895
+ if mobj is None:
1896
+ self._downloader.trouble(u'ERROR: unable to extract video title')
1897
+ return
1898
+ video_title = mobj.group(1).decode('utf-8')
1899
+ simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1900
+
1901
+ mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1902
+ if mobj is None:
1903
+ self._downloader.trouble(u'ERROR: unable to extract video uploader')
1904
+ return
1905
+ video_uploader = mobj.group(1).decode('utf-8')
1906
+
1907
+ # Extract video thumbnail
1908
+ mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1909
+ if mobj is None:
1910
+ self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1911
+ return
1912
+ video_thumbnail = mobj.group(1).decode('utf-8')
1913
+
1914
+ # Extract video description
1915
+ mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1916
+ if mobj is None:
1917
+ self._downloader.trouble(u'ERROR: unable to extract video description')
1918
+ return
1919
+ video_description = mobj.group(1).decode('utf-8')
1920
+ if not video_description:
1921
+ video_description = 'No description available.'
1922
+
1923
+ # Extract video height and width
1924
+ mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1925
+ if mobj is None:
1926
+ self._downloader.trouble(u'ERROR: unable to extract video height')
1927
+ return
1928
+ yv_video_height = mobj.group(1)
1929
+
1930
+ mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1931
+ if mobj is None:
1932
+ self._downloader.trouble(u'ERROR: unable to extract video width')
1933
+ return
1934
+ yv_video_width = mobj.group(1)
1935
+
1936
+ # Retrieve video playlist to extract media URL
1937
+ # I'm not completely sure what all these options are, but we
1938
+ # seem to need most of them, otherwise the server sends a 401.
1939
+ yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1940
+ yv_bitrate = '700' # according to Wikipedia this is hard-coded
1941
+ request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1942
+ '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1943
+ '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1944
+ try:
1945
+ self.report_download_webpage(video_id)
1946
+ webpage = urllib2.urlopen(request).read()
1947
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1948
+ self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1949
+ return
1950
+
1951
+ # Extract media URL from playlist XML
1952
+ mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1953
+ if mobj is None:
1954
+ self._downloader.trouble(u'ERROR: Unable to extract media URL')
1955
+ return
1956
+ video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1957
+ video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1958
+
1959
+ try:
1960
+ # Process video information
1961
+ self._downloader.process_info({
1962
+ 'id': video_id.decode('utf-8'),
1963
+ 'url': video_url,
1964
+ 'uploader': video_uploader,
1965
+ 'upload_date': u'NA',
1966
+ 'title': video_title,
1967
+ 'stitle': simple_title,
1968
+ 'ext': video_extension.decode('utf-8'),
1969
+ 'thumbnail': video_thumbnail.decode('utf-8'),
1970
+ 'description': video_description,
1971
+ 'thumbnail': video_thumbnail,
1972
+ 'player_url': None,
1973
+ })
1974
+ except UnavailableVideoError:
1975
+ self._downloader.trouble(u'\nERROR: unable to download video')
1976
+
1977
+
1978
+ class VimeoIE(InfoExtractor):
1979
+ """Information extractor for vimeo.com."""
1980
+
1981
+ # _VALID_URL matches Vimeo URLs
1982
+ _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1983
+ IE_NAME = u'vimeo'
1984
+
1985
+ def __init__(self, downloader=None):
1986
+ InfoExtractor.__init__(self, downloader)
1987
+
1988
+ def report_download_webpage(self, video_id):
1989
+ """Report webpage download."""
1990
+ self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1991
+
1992
+ def report_extraction(self, video_id):
1993
+ """Report information extraction."""
1994
+ self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1995
+
1996
+ def _real_initialize(self):
1997
+ return
1998
+
1999
+ def _real_extract(self, url, new_video=True):
2000
+ # Extract ID from URL
2001
+ mobj = re.match(self._VALID_URL, url)
2002
+ if mobj is None:
2003
+ self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2004
+ return
2005
+
2006
+ # At this point we have a new video
2007
+ self._downloader.increment_downloads()
2008
+ video_id = mobj.group(1)
2009
+
2010
+ # Retrieve video webpage to extract further information
2011
+ request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2012
+ try:
2013
+ self.report_download_webpage(video_id)
2014
+ webpage = urllib2.urlopen(request).read()
2015
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2016
+ self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2017
+ return
2018
+
2019
+ # Now we begin extracting as much information as we can from what we
2020
+ # retrieved. First we extract the information common to all extractors,
2021
+ # and latter we extract those that are Vimeo specific.
2022
+ self.report_extraction(video_id)
2023
+
2024
+ # Extract title
2025
+ mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2026
+ if mobj is None:
2027
+ self._downloader.trouble(u'ERROR: unable to extract video title')
2028
+ return
2029
+ video_title = mobj.group(1).decode('utf-8')
2030
+ simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2031
+
2032
+ # Extract uploader
2033
+ mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2034
+ if mobj is None:
2035
+ self._downloader.trouble(u'ERROR: unable to extract video uploader')
2036
+ return
2037
+ video_uploader = mobj.group(1).decode('utf-8')
2038
+
2039
+ # Extract video thumbnail
2040
+ mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2041
+ if mobj is None:
2042
+ self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2043
+ return
2044
+ video_thumbnail = mobj.group(1).decode('utf-8')
2045
+
2046
+ # # Extract video description
2047
+ # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2048
+ # if mobj is None:
2049
+ # self._downloader.trouble(u'ERROR: unable to extract video description')
2050
+ # return
2051
+ # video_description = mobj.group(1).decode('utf-8')
2052
+ # if not video_description: video_description = 'No description available.'
2053
+ video_description = 'Foo.'
2054
+
2055
+ # Vimeo specific: extract request signature
2056
+ mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2057
+ if mobj is None:
2058
+ self._downloader.trouble(u'ERROR: unable to extract request signature')
2059
+ return
2060
+ sig = mobj.group(1).decode('utf-8')
2061
+
2062
+ # Vimeo specific: extract video quality information
2063
+ mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2064
+ if mobj is None:
2065
+ self._downloader.trouble(u'ERROR: unable to extract video quality information')
2066
+ return
2067
+ quality = mobj.group(1).decode('utf-8')
2068
+
2069
+ if int(quality) == 1:
2070
+ quality = 'hd'
2071
+ else:
2072
+ quality = 'sd'
2073
+
2074
+ # Vimeo specific: Extract request signature expiration
2075
+ mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2076
+ if mobj is None:
2077
+ self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2078
+ return
2079
+ sig_exp = mobj.group(1).decode('utf-8')
2080
+
2081
+ video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2082
+
2083
+ try:
2084
+ # Process video information
2085
+ self._downloader.process_info({
2086
+ 'id': video_id.decode('utf-8'),
2087
+ 'url': video_url,
2088
+ 'uploader': video_uploader,
2089
+ 'upload_date': u'NA',
2090
+ 'title': video_title,
2091
+ 'stitle': simple_title,
2092
+ 'ext': u'mp4',
2093
+ 'thumbnail': video_thumbnail.decode('utf-8'),
2094
+ 'description': video_description,
2095
+ 'thumbnail': video_thumbnail,
2096
+ 'description': video_description,
2097
+ 'player_url': None,
2098
+ })
2099
+ except UnavailableVideoError:
2100
+ self._downloader.trouble(u'ERROR: unable to download video')
2101
+
2102
+
2103
+ class GenericIE(InfoExtractor):
2104
+ """Generic last-resort information extractor."""
2105
+
2106
+ _VALID_URL = r'.*'
2107
+ IE_NAME = u'generic'
2108
+
2109
+ def __init__(self, downloader=None):
2110
+ InfoExtractor.__init__(self, downloader)
2111
+
2112
+ def report_download_webpage(self, video_id):
2113
+ """Report webpage download."""
2114
+ self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2115
+ self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2116
+
2117
+ def report_extraction(self, video_id):
2118
+ """Report information extraction."""
2119
+ self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2120
+
2121
+ def _real_initialize(self):
2122
+ return
2123
+
2124
+ def _real_extract(self, url):
2125
+ # At this point we have a new video
2126
+ self._downloader.increment_downloads()
2127
+
2128
+ video_id = url.split('/')[-1]
2129
+ request = urllib2.Request(url)
2130
+ try:
2131
+ self.report_download_webpage(video_id)
2132
+ webpage = urllib2.urlopen(request).read()
2133
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2134
+ self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2135
+ return
2136
+ except ValueError, err:
2137
+ # since this is the last-resort InfoExtractor, if
2138
+ # this error is thrown, it'll be thrown here
2139
+ self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2140
+ return
2141
+
2142
+ self.report_extraction(video_id)
2143
+ # Start with something easy: JW Player in SWFObject
2144
+ mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2145
+ if mobj is None:
2146
+ # Broaden the search a little bit
2147
+ mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2148
+ if mobj is None:
2149
+ self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2150
+ return
2151
+
2152
+ # It's possible that one of the regexes
2153
+ # matched, but returned an empty group:
2154
+ if mobj.group(1) is None:
2155
+ self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2156
+ return
2157
+
2158
+ video_url = urllib.unquote(mobj.group(1))
2159
+ video_id = os.path.basename(video_url)
2160
+
2161
+ # here's a fun little line of code for you:
2162
+ video_extension = os.path.splitext(video_id)[1][1:]
2163
+ video_id = os.path.splitext(video_id)[0]
2164
+
2165
+ # it's tempting to parse this further, but you would
2166
+ # have to take into account all the variations like
2167
+ # Video Title - Site Name
2168
+ # Site Name | Video Title
2169
+ # Video Title - Tagline | Site Name
2170
+ # and so on and so forth; it's just not practical
2171
+ mobj = re.search(r'<title>(.*)</title>', webpage)
2172
+ if mobj is None:
2173
+ self._downloader.trouble(u'ERROR: unable to extract title')
2174
+ return
2175
+ video_title = mobj.group(1).decode('utf-8')
2176
+ video_title = sanitize_title(video_title)
2177
+ simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2178
+
2179
+ # video uploader is domain name
2180
+ mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2181
+ if mobj is None:
2182
+ self._downloader.trouble(u'ERROR: unable to extract title')
2183
+ return
2184
+ video_uploader = mobj.group(1).decode('utf-8')
2185
+
2186
+ try:
2187
+ # Process video information
2188
+ self._downloader.process_info({
2189
+ 'id': video_id.decode('utf-8'),
2190
+ 'url': video_url.decode('utf-8'),
2191
+ 'uploader': video_uploader,
2192
+ 'upload_date': u'NA',
2193
+ 'title': video_title,
2194
+ 'stitle': simple_title,
2195
+ 'ext': video_extension.decode('utf-8'),
2196
+ 'format': u'NA',
2197
+ 'player_url': None,
2198
+ })
2199
+ except UnavailableVideoError, err:
2200
+ self._downloader.trouble(u'\nERROR: unable to download video')
2201
+
2202
+
2203
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Queries look like 'ytsearch:foo', 'ytsearch5:foo' or
    'ytsearchall:foo'; matching videos are handed to the wrapped
    YoutubeIE one at a time.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    _youtube_ie = None          # YoutubeIE instance that does the real extraction
    _max_youtube_results = 1000  # hard cap on results per query
    IE_NAME = u'youtube:search'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # Drop the literal 'ytsearch' (8 chars), leaving '', 'all' or a count.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # The match is 'href="/watch?v=<id>"'; split on '=' and drop
                # the trailing quote to isolate the id.
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                        return

            # No "Next" link means the last results page was reached.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                return

            pagenum = pagenum + 1
2290
+
2291
+
2292
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Queries look like 'gvsearch:foo', 'gvsearch5:foo' or
    'gvsearchall:foo'; matching docids are handed to the wrapped
    GoogleIE one at a time.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    # NOTE(review): 'start=%s' is fed the page number below, although a
    # 'start' parameter usually expects a result offset — TODO confirm.
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
    _MORE_PAGES_INDICATOR = r'<span>Next</span>'
    _google_ie = None           # GoogleIE instance that does the real extraction
    _max_google_results = 1000  # hard cap on results per query
    IE_NAME = u'video.google:search'

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._google_ie = google_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # Drop the literal 'gvsearch' (8 chars), leaving '', 'all' or a count.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
                        return

            # No "Next" link means the last results page was reached.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
                return

            pagenum = pagenum + 1
2379
+
2380
+
2381
+ class YahooSearchIE(InfoExtractor):
2382
+ """Information Extractor for Yahoo! Video search queries."""
2383
+ _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2384
+ _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2385
+ _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2386
+ _MORE_PAGES_INDICATOR = r'\s*Next'
2387
+ _yahoo_ie = None
2388
+ _max_yahoo_results = 1000
2389
+ IE_NAME = u'video.yahoo:search'
2390
+
2391
+ def __init__(self, yahoo_ie, downloader=None):
2392
+ InfoExtractor.__init__(self, downloader)
2393
+ self._yahoo_ie = yahoo_ie
2394
+
2395
+ def report_download_page(self, query, pagenum):
2396
+ """Report attempt to download playlist page with given number."""
2397
+ query = query.decode(preferredencoding())
2398
+ self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2399
+
2400
+ def _real_initialize(self):
2401
+ self._yahoo_ie.initialize()
2402
+
2403
+ def _real_extract(self, query):
2404
+ mobj = re.match(self._VALID_URL, query)
2405
+ if mobj is None:
2406
+ self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2407
+ return
2408
+
2409
+ prefix, query = query.split(':')
2410
+ prefix = prefix[8:]
2411
+ query = query.encode('utf-8')
2412
+ if prefix == '':
2413
+ self._download_n_results(query, 1)
2414
+ return
2415
+ elif prefix == 'all':
2416
+ self._download_n_results(query, self._max_yahoo_results)
2417
+ return
2418
+ else:
2419
+ try:
2420
+ n = long(prefix)
2421
+ if n <= 0:
2422
+ self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2423
+ return
2424
+ elif n > self._max_yahoo_results:
2425
+ self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2426
+ n = self._max_yahoo_results
2427
+ self._download_n_results(query, n)
2428
+ return
2429
+ except ValueError: # parsing prefix as integer fails
2430
+ self._download_n_results(query, 1)
2431
+ return
2432
+
2433
+ def _download_n_results(self, query, n):
2434
+ """Downloads a specified number of results for a query"""
2435
+
2436
+ video_ids = []
2437
+ already_seen = set()
2438
+ pagenum = 1
2439
+
2440
+ while True:
2441
+ self.report_download_page(query, pagenum)
2442
+ result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2443
+ request = urllib2.Request(result_url)
2444
+ try:
2445
+ page = urllib2.urlopen(request).read()
2446
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2447
+ self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2448
+ return
2449
+
2450
+ # Extract video identifiers
2451
+ for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2452
+ video_id = mobj.group(1)
2453
+ if video_id not in already_seen:
2454
+ video_ids.append(video_id)
2455
+ already_seen.add(video_id)
2456
+ if len(video_ids) == n:
2457
+ # Specified n videos reached
2458
+ for id in video_ids:
2459
+ self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2460
+ return
2461
+
2462
+ if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2463
+ for id in video_ids:
2464
+ self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2465
+ return
2466
+
2467
+ pagenum = pagenum + 1
2468
+
2469
+
2470
+ class YoutubePlaylistIE(InfoExtractor):
2471
+ """Information Extractor for YouTube playlists."""
2472
+
2473
+ _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2474
+ _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2475
+ _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2476
+ _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2477
+ _youtube_ie = None
2478
+ IE_NAME = u'youtube:playlist'
2479
+
2480
+ def __init__(self, youtube_ie, downloader=None):
2481
+ InfoExtractor.__init__(self, downloader)
2482
+ self._youtube_ie = youtube_ie
2483
+
2484
+ def report_download_page(self, playlist_id, pagenum):
2485
+ """Report attempt to download playlist page with given number."""
2486
+ self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2487
+
2488
+ def _real_initialize(self):
2489
+ self._youtube_ie.initialize()
2490
+
2491
+ def _real_extract(self, url):
2492
+ # Extract playlist id
2493
+ mobj = re.match(self._VALID_URL, url)
2494
+ if mobj is None:
2495
+ self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2496
+ return
2497
+
2498
+ # Single video case
2499
+ if mobj.group(3) is not None:
2500
+ self._youtube_ie.extract(mobj.group(3))
2501
+ return
2502
+
2503
+ # Download playlist pages
2504
+ # prefix is 'p' as default for playlists but there are other types that need extra care
2505
+ playlist_prefix = mobj.group(1)
2506
+ if playlist_prefix == 'a':
2507
+ playlist_access = 'artist'
2508
+ else:
2509
+ playlist_prefix = 'p'
2510
+ playlist_access = 'view_play_list'
2511
+ playlist_id = mobj.group(2)
2512
+ video_ids = []
2513
+ pagenum = 1
2514
+
2515
+ while True:
2516
+ self.report_download_page(playlist_id, pagenum)
2517
+ request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2518
+ try:
2519
+ page = urllib2.urlopen(request).read()
2520
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2521
+ self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2522
+ return
2523
+
2524
+ # Extract video identifiers
2525
+ ids_in_page = []
2526
+ for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2527
+ if mobj.group(1) not in ids_in_page:
2528
+ ids_in_page.append(mobj.group(1))
2529
+ video_ids.extend(ids_in_page)
2530
+
2531
+ if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2532
+ break
2533
+ pagenum = pagenum + 1
2534
+
2535
+ playliststart = self._downloader.params.get('playliststart', 1) - 1
2536
+ playlistend = self._downloader.params.get('playlistend', -1)
2537
+ video_ids = video_ids[playliststart:playlistend]
2538
+
2539
+ for id in video_ids:
2540
+ self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2541
+ return
2542
+
2543
+
2544
+ class YoutubeUserIE(InfoExtractor):
2545
+ """Information Extractor for YouTube users."""
2546
+
2547
+ _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2548
+ _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2549
+ _GDATA_PAGE_SIZE = 50
2550
+ _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2551
+ _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2552
+ _youtube_ie = None
2553
+ IE_NAME = u'youtube:user'
2554
+
2555
+ def __init__(self, youtube_ie, downloader=None):
2556
+ InfoExtractor.__init__(self, downloader)
2557
+ self._youtube_ie = youtube_ie
2558
+
2559
+ def report_download_page(self, username, start_index):
2560
+ """Report attempt to download user page."""
2561
+ self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2562
+ (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2563
+
2564
+ def _real_initialize(self):
2565
+ self._youtube_ie.initialize()
2566
+
2567
+ def _real_extract(self, url):
2568
+ # Extract username
2569
+ mobj = re.match(self._VALID_URL, url)
2570
+ if mobj is None:
2571
+ self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2572
+ return
2573
+
2574
+ username = mobj.group(1)
2575
+
2576
+ # Download video ids using YouTube Data API. Result size per
2577
+ # query is limited (currently to 50 videos) so we need to query
2578
+ # page by page until there are no video ids - it means we got
2579
+ # all of them.
2580
+
2581
+ video_ids = []
2582
+ pagenum = 0
2583
+
2584
+ while True:
2585
+ start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2586
+ self.report_download_page(username, start_index)
2587
+
2588
+ request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2589
+
2590
+ try:
2591
+ page = urllib2.urlopen(request).read()
2592
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2593
+ self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2594
+ return
2595
+
2596
+ # Extract video identifiers
2597
+ ids_in_page = []
2598
+
2599
+ for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2600
+ if mobj.group(1) not in ids_in_page:
2601
+ ids_in_page.append(mobj.group(1))
2602
+
2603
+ video_ids.extend(ids_in_page)
2604
+
2605
+ # A little optimization - if current page is not
2606
+ # "full", ie. does not contain PAGE_SIZE video ids then
2607
+ # we can assume that this page is the last one - there
2608
+ # are no more ids on further pages - no need to query
2609
+ # again.
2610
+
2611
+ if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2612
+ break
2613
+
2614
+ pagenum += 1
2615
+
2616
+ all_ids_count = len(video_ids)
2617
+ playliststart = self._downloader.params.get('playliststart', 1) - 1
2618
+ playlistend = self._downloader.params.get('playlistend', -1)
2619
+
2620
+ if playlistend == -1:
2621
+ video_ids = video_ids[playliststart:]
2622
+ else:
2623
+ video_ids = video_ids[playliststart:playlistend]
2624
+
2625
+ self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2626
+ (username, all_ids_count, len(video_ids)))
2627
+
2628
+ for video_id in video_ids:
2629
+ self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2630
+
2631
+
2632
+ class DepositFilesIE(InfoExtractor):
2633
+ """Information extractor for depositfiles.com"""
2634
+
2635
+ _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2636
+ IE_NAME = u'DepositFiles'
2637
+
2638
+ def __init__(self, downloader=None):
2639
+ InfoExtractor.__init__(self, downloader)
2640
+
2641
+ def report_download_webpage(self, file_id):
2642
+ """Report webpage download."""
2643
+ self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2644
+
2645
+ def report_extraction(self, file_id):
2646
+ """Report information extraction."""
2647
+ self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2648
+
2649
+ def _real_initialize(self):
2650
+ return
2651
+
2652
+ def _real_extract(self, url):
2653
+ # At this point we have a new file
2654
+ self._downloader.increment_downloads()
2655
+
2656
+ file_id = url.split('/')[-1]
2657
+ # Rebuild url in english locale
2658
+ url = 'http://depositfiles.com/en/files/' + file_id
2659
+
2660
+ # Retrieve file webpage with 'Free download' button pressed
2661
+ free_download_indication = { 'gateway_result' : '1' }
2662
+ request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2663
+ try:
2664
+ self.report_download_webpage(file_id)
2665
+ webpage = urllib2.urlopen(request).read()
2666
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2667
+ self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2668
+ return
2669
+
2670
+ # Search for the real file URL
2671
+ mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2672
+ if (mobj is None) or (mobj.group(1) is None):
2673
+ # Try to figure out reason of the error.
2674
+ mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2675
+ if (mobj is not None) and (mobj.group(1) is not None):
2676
+ restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2677
+ self._downloader.trouble(u'ERROR: %s' % restriction_message)
2678
+ else:
2679
+ self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2680
+ return
2681
+
2682
+ file_url = mobj.group(1)
2683
+ file_extension = os.path.splitext(file_url)[1][1:]
2684
+
2685
+ # Search for file title
2686
+ mobj = re.search(r'<b title="(.*?)">', webpage)
2687
+ if mobj is None:
2688
+ self._downloader.trouble(u'ERROR: unable to extract title')
2689
+ return
2690
+ file_title = mobj.group(1).decode('utf-8')
2691
+
2692
+ try:
2693
+ # Process file information
2694
+ self._downloader.process_info({
2695
+ 'id': file_id.decode('utf-8'),
2696
+ 'url': file_url.decode('utf-8'),
2697
+ 'uploader': u'NA',
2698
+ 'upload_date': u'NA',
2699
+ 'title': file_title,
2700
+ 'stitle': file_title,
2701
+ 'ext': file_extension.decode('utf-8'),
2702
+ 'format': u'NA',
2703
+ 'player_url': None,
2704
+ })
2705
+ except UnavailableVideoError, err:
2706
+ self._downloader.trouble(u'ERROR: unable to download file')
2707
+
2708
+
2709
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/video/video\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best-quality-first; the format-selection logic below slices
    # this list, so the order is significant.
    _available_formats = ['highqual', 'lowqual']
    _video_extensions = {
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Returns a dict with any of 'title', 'description', 'owner',
        'upload_date', 'thumbnail' that could be found, plus a
        'video_urls' dict mapping format name -> direct URL (always present,
        possibly empty).
        """
        # General data: each metadata field is scraped with its own regex.
        data = {'title': r'class="video_title datawrap">(.*?)</',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'upload_date': r'data-date="(.*?)"',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values arrive \uXXXX-escaped inside inline JavaScript,
                # hence the unicode_escape decode before URL-unquoting.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls: probe the page for each known format's source URL.
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook with credentials from CLI flags or ~/.netrc.

        Silently does nothing when no downloader or no credentials are
        available; login failures are reported as warnings, not errors.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        if useremail is None:
            return

        # Log in by POSTing the mobile login form.
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the response still contains the login form, the login failed.
            # (NOTE: "exceded" typo below is in the user-visible message as shipped.)
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract and download the video(s) behind a Facebook video URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader (mandatory: abort if missing)
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title (mandatory: abort if missing)
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title: collapse disallowed characters into underscores
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image (optional: warn and continue if missing)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date (optional, RFC-2822 style; reformatted to YYYYMMDD)
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download based on the user's
            # --format / --max-quality options.
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:

            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                    'player_url': None,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
2926
+
2927
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the filename extension off a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _simplify_title(self, title):
        """Collapse characters outside simple_title_chars into underscores."""
        res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
        res = res.strip(ur'_')
        return res

    def _real_extract(self, url):
        """Extract a blip.tv video.

        Two paths: if requesting the URL with JSON parameters appended
        answers with a video/* Content-Type, the URL itself is the media
        (direct download); otherwise the response body is JSON metadata
        that is parsed for the real media URL and attributes.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-API parameters with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'title': title,
                    'stitle': self._simplify_title(title),
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            return
        if info is None: # Regular URL
            try:
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
                return

            try:
                # NOTE(review): `json` is resolved at module level outside
                # this chunk (not in the visible import block) — presumably
                # a json/simplejson fallback; confirm at file top.
                json_data = json.loads(json_code)
                # Metadata may be wrapped in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Reformat e.g. '10-31-11 09:45PM' into 'YYYYMMDD'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'stitle': self._simplify_title(data['title']),
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError), err:
                # Missing keys and the extension failure above both land here.
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        self._downloader.increment_downloads()

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
3023
+
3024
+
3025
+ class MyVideoIE(InfoExtractor):
3026
+ """Information Extractor for myvideo.de."""
3027
+
3028
+ _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3029
+ IE_NAME = u'myvideo'
3030
+
3031
+ def __init__(self, downloader=None):
3032
+ InfoExtractor.__init__(self, downloader)
3033
+
3034
+ def report_download_webpage(self, video_id):
3035
+ """Report webpage download."""
3036
+ self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3037
+
3038
+ def report_extraction(self, video_id):
3039
+ """Report information extraction."""
3040
+ self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3041
+
3042
+ def _real_initialize(self):
3043
+ return
3044
+
3045
+ def _real_extract(self,url):
3046
+ mobj = re.match(self._VALID_URL, url)
3047
+ if mobj is None:
3048
+ self._download.trouble(u'ERROR: invalid URL: %s' % url)
3049
+ return
3050
+
3051
+ video_id = mobj.group(1)
3052
+ simple_title = mobj.group(2).decode('utf-8')
3053
+ # should actually not be necessary
3054
+ simple_title = sanitize_title(simple_title)
3055
+ simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3056
+
3057
+ # Get video webpage
3058
+ request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3059
+ try:
3060
+ self.report_download_webpage(video_id)
3061
+ webpage = urllib2.urlopen(request).read()
3062
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3063
+ self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3064
+ return
3065
+
3066
+ self.report_extraction(video_id)
3067
+ mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3068
+ webpage)
3069
+ if mobj is None:
3070
+ self._downloader.trouble(u'ERROR: unable to extract media URL')
3071
+ return
3072
+ video_url = mobj.group(1) + ('/%s.flv' % video_id)
3073
+
3074
+ mobj = re.search('<title>([^<]+)</title>', webpage)
3075
+ if mobj is None:
3076
+ self._downloader.trouble(u'ERROR: unable to extract title')
3077
+ return
3078
+
3079
+ video_title = mobj.group(1)
3080
+ video_title = sanitize_title(video_title)
3081
+
3082
+ try:
3083
+ self._downloader.process_info({
3084
+ 'id': video_id,
3085
+ 'url': video_url,
3086
+ 'uploader': u'NA',
3087
+ 'upload_date': u'NA',
3088
+ 'title': video_title,
3089
+ 'stitle': simple_title,
3090
+ 'ext': u'flv',
3091
+ 'format': u'NA',
3092
+ 'player_url': None,
3093
+ })
3094
+ except UnavailableVideoError:
3095
+ self._downloader.trouble(u'\nERROR: Unable to download video')
3096
+
3097
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts either the ":tds"/":colbert" shortcut forms or a full
    # full-episodes URL on thedailyshow.com / colbertnation.com.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of a media-item configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the episode's media index (MRSS feed)."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the Flash player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _simplify_title(self, title):
        """Collapse characters outside simple_title_chars into underscores."""
        res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
        res = res.strip(ur'_')
        return res

    def _real_extract(self, url):
        """Extract every media part of a Daily Show / Colbert episode.

        Flow: resolve shortcuts -> fetch episode page -> find the Flash
        movie param -> resolve player URL -> fetch MRSS index -> for each
        item fetch its mediagen config XML and download the best bitrate.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortcut forms redirect to the show's "newest full episode" page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = 'http://www.thedailyshow.com/full-episodes/'
            else:
                url = 'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No episode part means "download the newest"; the server redirect
        # will tell us which episode that actually is.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
            return
        if dlNewest:
            # Re-parse the post-redirect URL to learn the concrete episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # The Flash <param> both points at the player and carries the feed URI.
        mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            return

        # Follow the player URL's redirects to get its final location.
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
            return

        # NOTE(review): xml.etree is resolved at module level outside this
        # chunk (not in the visible import block) — confirm at file top.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per media part of the episode (acts/segments).
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
                return

            # Collect (bitrate, url) pairs from the config's renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            self._downloader.increment_downloads()

            effTitle = showId + '-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'stitle': self._simplify_title(effTitle),
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': playerUrl
            }

            try:
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
                continue
3236
+
3237
+
3238
+ class EscapistIE(InfoExtractor):
3239
+ """Information extractor for The Escapist """
3240
+
3241
+ _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3242
+ IE_NAME = u'escapist'
3243
+
3244
+ def report_extraction(self, showName):
3245
+ self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3246
+
3247
+ def report_config_download(self, showName):
3248
+ self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3249
+
3250
+ def _simplify_title(self, title):
3251
+ res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3252
+ res = res.strip(ur'_')
3253
+ return res
3254
+
3255
+ def _real_extract(self, url):
3256
+ htmlParser = HTMLParser.HTMLParser()
3257
+
3258
+ mobj = re.match(self._VALID_URL, url)
3259
+ if mobj is None:
3260
+ self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3261
+ return
3262
+ showName = mobj.group('showname')
3263
+ videoId = mobj.group('episode')
3264
+
3265
+ self.report_extraction(showName)
3266
+ try:
3267
+ webPage = urllib2.urlopen(url).read()
3268
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3269
+ self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3270
+ return
3271
+
3272
+ descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3273
+ description = htmlParser.unescape(descMatch.group(1))
3274
+ imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3275
+ imgUrl = htmlParser.unescape(imgMatch.group(1))
3276
+ playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3277
+ playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3278
+ configUrlMatch = re.search('config=(.*)$', playerUrl)
3279
+ configUrl = urllib2.unquote(configUrlMatch.group(1))
3280
+
3281
+ self.report_config_download(showName)
3282
+ try:
3283
+ configJSON = urllib2.urlopen(configUrl).read()
3284
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3285
+ self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3286
+ return
3287
+
3288
+ # Technically, it's JavaScript, not JSON
3289
+ configJSON = configJSON.replace("'", '"')
3290
+
3291
+ try:
3292
+ config = json.loads(configJSON)
3293
+ except (ValueError,), err:
3294
+ self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3295
+ return
3296
+
3297
+ playlist = config['playlist']
3298
+ videoUrl = playlist[1]['url']
3299
+
3300
+ self._downloader.increment_downloads()
3301
+ info = {
3302
+ 'id': videoId,
3303
+ 'url': videoUrl,
3304
+ 'uploader': showName,
3305
+ 'upload_date': None,
3306
+ 'title': showName,
3307
+ 'stitle': self._simplify_title(showName),
3308
+ 'ext': 'flv',
3309
+ 'format': 'flv',
3310
+ 'thumbnail': imgUrl,
3311
+ 'description': description,
3312
+ 'player_url': playerUrl,
3313
+ }
3314
+
3315
+ try:
3316
+ self._downloader.process_info(info)
3317
+ except UnavailableVideoError, err:
3318
+ self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3319
+
3320
+
3321
+ class CollegeHumorIE(InfoExtractor):
3322
+ """Information extractor for collegehumor.com"""
3323
+
3324
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3325
+ IE_NAME = u'collegehumor'
3326
+
3327
+ def report_webpage(self, video_id):
3328
+ """Report information extraction."""
3329
+ self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3330
+
3331
+ def report_extraction(self, video_id):
3332
+ """Report information extraction."""
3333
+ self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3334
+
3335
+ def _simplify_title(self, title):
3336
+ res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3337
+ res = res.strip(ur'_')
3338
+ return res
3339
+
3340
+ def _real_extract(self, url):
3341
+ htmlParser = HTMLParser.HTMLParser()
3342
+
3343
+ mobj = re.match(self._VALID_URL, url)
3344
+ if mobj is None:
3345
+ self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3346
+ return
3347
+ video_id = mobj.group('videoid')
3348
+
3349
+ self.report_webpage(video_id)
3350
+ request = urllib2.Request(url)
3351
+ try:
3352
+ webpage = urllib2.urlopen(request).read()
3353
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3354
+ self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3355
+ return
3356
+
3357
+ m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3358
+ if m is None:
3359
+ self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3360
+ return
3361
+ internal_video_id = m.group('internalvideoid')
3362
+
3363
+ info = {
3364
+ 'id': video_id,
3365
+ 'internal_id': internal_video_id,
3366
+ }
3367
+
3368
+ self.report_extraction(video_id)
3369
+ xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3370
+ try:
3371
+ metaXml = urllib2.urlopen(xmlUrl).read()
3372
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3373
+ self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3374
+ return
3375
+
3376
+ mdoc = xml.etree.ElementTree.fromstring(metaXml)
3377
+ try:
3378
+ videoNode = mdoc.findall('./video')[0]
3379
+ info['description'] = videoNode.findall('./description')[0].text
3380
+ info['title'] = videoNode.findall('./caption')[0].text
3381
+ info['stitle'] = self._simplify_title(info['title'])
3382
+ info['url'] = videoNode.findall('./file')[0].text
3383
+ info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3384
+ info['ext'] = info['url'].rpartition('.')[2]
3385
+ info['format'] = info['ext']
3386
+ except IndexError:
3387
+ self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3388
+ return
3389
+
3390
+ self._downloader.increment_downloads()
3391
+
3392
+ try:
3393
+ self._downloader.process_info(info)
3394
+ except UnavailableVideoError, err:
3395
+ self._downloader.trouble(u'\nERROR: unable to download video')
3396
+
3397
+
3398
+ class XVideosIE(InfoExtractor):
3399
+ """Information extractor for xvideos.com"""
3400
+
3401
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3402
+ IE_NAME = u'xvideos'
3403
+
3404
+ def report_webpage(self, video_id):
3405
+ """Report information extraction."""
3406
+ self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3407
+
3408
+ def report_extraction(self, video_id):
3409
+ """Report information extraction."""
3410
+ self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3411
+
3412
+ def _simplify_title(self, title):
3413
+ res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3414
+ res = res.strip(ur'_')
3415
+ return res
3416
+
3417
+ def _real_extract(self, url):
3418
+ htmlParser = HTMLParser.HTMLParser()
3419
+
3420
+ mobj = re.match(self._VALID_URL, url)
3421
+ if mobj is None:
3422
+ self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3423
+ return
3424
+ video_id = mobj.group(1).decode('utf-8')
3425
+
3426
+ self.report_webpage(video_id)
3427
+
3428
+ request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3429
+ try:
3430
+ webpage = urllib2.urlopen(request).read()
3431
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3432
+ self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3433
+ return
3434
+
3435
+ self.report_extraction(video_id)
3436
+
3437
+
3438
+ # Extract video URL
3439
+ mobj = re.search(r'flv_url=(.+?)&', webpage)
3440
+ if mobj is None:
3441
+ self._downloader.trouble(u'ERROR: unable to extract video url')
3442
+ return
3443
+ video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3444
+
3445
+
3446
+ # Extract title
3447
+ mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3448
+ if mobj is None:
3449
+ self._downloader.trouble(u'ERROR: unable to extract video title')
3450
+ return
3451
+ video_title = mobj.group(1).decode('utf-8')
3452
+
3453
+
3454
+ # Extract video thumbnail
3455
+ mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3456
+ if mobj is None:
3457
+ self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3458
+ return
3459
+ video_thumbnail = mobj.group(1).decode('utf-8')
3460
+
3461
+
3462
+
3463
+ self._downloader.increment_downloads()
3464
+ info = {
3465
+ 'id': video_id,
3466
+ 'url': video_url,
3467
+ 'uploader': None,
3468
+ 'upload_date': None,
3469
+ 'title': video_title,
3470
+ 'stitle': self._simplify_title(video_title),
3471
+ 'ext': 'flv',
3472
+ 'format': 'flv',
3473
+ 'thumbnail': video_thumbnail,
3474
+ 'description': None,
3475
+ 'player_url': None,
3476
+ }
3477
+
3478
+ try:
3479
+ self._downloader.process_info(info)
3480
+ except UnavailableVideoError, err:
3481
+ self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3482
+
3483
+
3484
class PostProcessor(object):
    """Base class for post-processing steps.

    Instances are registered on a downloader via its add_post_processor()
    method. After each successful download, the downloader feeds an info
    dictionary through the chain of registered PostProcessors: each run()
    receives the previous step's return value. A step that returns None
    stops the chain; otherwise its (possibly modified) dictionary is handed
    to the next step.

    Like InfoExtractor, PostProcessor uses a "mutual registration" scheme
    with its downloader.
    """

    # Downloader this post-processor is attached to (None until registered).
    _downloader = None

    def __init__(self, downloader=None):
        self.set_downloader(downloader)

    def set_downloader(self, downloader):
        """Attach *downloader* to this post-processor."""
        self._downloader = downloader

    def run(self, information):
        """Execute this post-processing step.

        *information* is an info dictionary as produced by an
        InfoExtractor, extended with a "filepath" key pointing at the
        downloaded file.

        Returning None halts the post-processing chain; returning a
        dictionary (possibly with modified fields) passes it on to the
        next PostProcessor. Implementations may also raise
        PostProcessingError, which the calling downloader handles.

        The base implementation is the identity step.
        """
        return information
3529
+
3530
+
3531
class FFmpegExtractAudioPP(PostProcessor):
    """Post-processor that extracts the audio track of a downloaded video.

    Uses ffprobe to detect the source audio codec, then runs ffmpeg to
    either copy the stream losslessly (when the target codec matches) or
    transcode it to the preferred codec/bitrate. Optionally removes the
    original video file afterwards.
    """

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        PostProcessor.__init__(self, downloader)
        # 'best' means: keep the source codec when lossless copy is possible.
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec
        self._preferredquality = preferredquality
        self._keepvideo = keepvideo

    @staticmethod
    def get_audio_codec(path):
        """Return the codec name of the audio stream in *path*, or None.

        Parses `ffprobe -show_streams` output; returns None when ffprobe
        is missing, fails, or reports no audio stream.
        """
        try:
            cmd = ['ffprobe', '-show_streams', '--', path]
            # open() instead of the Python-2-only file() builtin, and the
            # canonical os.devnull instead of os.path.devnull.
            handle = subprocess.Popen(cmd, stderr=open(os.devnull, 'w'), stdout=subprocess.PIPE)
            output = handle.communicate()[0]
            if handle.wait() != 0:
                return None
        except (IOError, OSError):
            return None
        audio_codec = None
        for line in output.split('\n'):
            # codec_name precedes codec_type within each stream section, so
            # remember the last codec_name seen and confirm on codec_type.
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to extract/transcode audio; return True on success."""
        try:
            cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
            ret = subprocess.call(cmd, stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT)
            return (ret == 0)
        except (IOError, OSError):
            return False

    def run(self, information):
        """Extract audio from information['filepath'].

        Returns the updated info dictionary on success, or None to stop
        the post-processing chain on failure.
        """
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
            if filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                    extension = 'ogg'
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = []
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = []
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'vorbis':
                extension = 'ogg'

        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        status = self.run_ffmpeg(path, new_path, acodec, more_opts)

        if not status:
            self._downloader.to_stderr(u'WARNING: error running ffmpeg')
            return None

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            try:
                os.utime(new_path, (time.time(), information['filetime']))
            except (IOError, OSError):
                # BUGFIX: was a bare except, which also swallowed
                # KeyboardInterrupt/SystemExit.
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            try:
                os.remove(path)
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                return None

        information['filepath'] = new_path
        return information
3630
+
3631
+
3632
+ def updateSelf(downloader, filename):
3633
+ ''' Update the program file with the latest version from the repository '''
3634
+ # Note: downloader only used for options
3635
+ if not os.access(filename, os.W_OK):
3636
+ sys.exit('ERROR: no write permissions on %s' % filename)
3637
+
3638
+ downloader.to_screen('Updating to latest version...')
3639
+
3640
+ try:
3641
+ try:
3642
+ urlh = urllib.urlopen(UPDATE_URL)
3643
+ newcontent = urlh.read()
3644
+
3645
+ vmatch = re.search("__version__ = '([^']+)'", newcontent)
3646
+ if vmatch is not None and vmatch.group(1) == __version__:
3647
+ downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3648
+ return
3649
+ finally:
3650
+ urlh.close()
3651
+ except (IOError, OSError), err:
3652
+ sys.exit('ERROR: unable to download latest version')
3653
+
3654
+ try:
3655
+ outf = open(filename, 'wb')
3656
+ try:
3657
+ outf.write(newcontent)
3658
+ finally:
3659
+ outf.close()
3660
+ except (IOError, OSError), err:
3661
+ sys.exit('ERROR: unable to overwrite current version')
3662
+
3663
+ downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3664
+
3665
def parseOpts():
    """Build the optparse command-line parser and parse sys.argv.

    Returns a (parser, opts, args) triple: the OptionParser itself (so
    callers can report errors via parser.error), the parsed option values,
    and the positional arguments (URLs).
    """
    # Deferred imports: only needed when parsing the command line.
    import getpass
    import optparse

    def _format_option_string(option):
        ''' ('-o', '--option') -> -o, --format METAVAR'''

        opts = []

        if option._short_opts: opts.append(option._short_opts[0])
        if option._long_opts: opts.append(option._long_opts[0])
        if len(opts) > 1: opts.insert(1, ', ')

        if option.takes_value(): opts.append(' %s' % option.metavar)

        return "".join(opts)

    def _find_term_columns():
        # Width of the controlling terminal: $COLUMNS if set, otherwise
        # ask stty; None when neither works (e.g. no tty).
        columns = os.environ.get('COLUMNS', None)
        if columns:
            return int(columns)

        try:
            sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out,err = sp.communicate()
            return int(out.split()[1])
        except:
            pass
        return None

    max_width = 80
    max_help_position = 80

    # No need to wrap help messages if we're on a wide console
    columns = _find_term_columns()
    if columns: max_width = columns

    fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
    fmt.format_option_strings = _format_option_string

    kw = {
        'version' : __version__,
        'formatter' : fmt,
        'usage' : '%prog [options] url [url...]',
        'conflict_handler' : 'resolve',
    }

    parser = optparse.OptionParser(**kw)

    # option groups
    general = optparse.OptionGroup(parser, 'General Options')
    selection = optparse.OptionGroup(parser, 'Video Selection')
    authentication = optparse.OptionGroup(parser, 'Authentication Options')
    video_format = optparse.OptionGroup(parser, 'Video Format Options')
    postproc = optparse.OptionGroup(parser, 'Post-processing Options')
    filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
    verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

    general.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    general.add_option('-v', '--version',
            action='version', help='print program version and exit')
    general.add_option('-U', '--update',
            action='store_true', dest='update_self', help='update this program to latest version')
    general.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    general.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
    general.add_option('-R', '--retries',
            dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
    general.add_option('--dump-user-agent',
            action='store_true', dest='dump_user_agent',
            help='display the current browser identification', default=False)
    general.add_option('--list-extractors',
            action='store_true', dest='list_extractors',
            help='List all supported extractors and the URLs they would handle', default=False)

    selection.add_option('--playlist-start',
            dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
    selection.add_option('--playlist-end',
            dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
    selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
    selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

    authentication.add_option('-u', '--username',
            dest='username', metavar='USERNAME', help='account username')
    authentication.add_option('-p', '--password',
            dest='password', metavar='PASSWORD', help='account password')
    authentication.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


    video_format.add_option('-f', '--format',
            action='store', dest='format', metavar='FORMAT', help='video format code')
    video_format.add_option('--all-formats',
            action='store_const', dest='format', help='download all available video formats', const='all')
    video_format.add_option('--max-quality',
            action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
    video_format.add_option('-F', '--list-formats',
            action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


    verbosity.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    verbosity.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
    verbosity.add_option('--skip-download',
            action='store_true', dest='skip_download', help='do not download the video', default=False)
    verbosity.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    verbosity.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    verbosity.add_option('--get-thumbnail',
            action='store_true', dest='getthumbnail',
            help='simulate, quiet but print thumbnail URL', default=False)
    verbosity.add_option('--get-description',
            action='store_true', dest='getdescription',
            help='simulate, quiet but print video description', default=False)
    verbosity.add_option('--get-filename',
            action='store_true', dest='getfilename',
            help='simulate, quiet but print output filename', default=False)
    verbosity.add_option('--get-format',
            action='store_true', dest='getformat',
            help='simulate, quiet but print output format', default=False)
    verbosity.add_option('--no-progress',
            action='store_true', dest='noprogress', help='do not print progress bar', default=False)
    verbosity.add_option('--console-title',
            action='store_true', dest='consoletitle',
            help='display progress in console titlebar', default=False)


    filesystem.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    filesystem.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    filesystem.add_option('-A', '--auto-number',
            action='store_true', dest='autonumber',
            help='number downloaded files starting from 00000', default=False)
    filesystem.add_option('-o', '--output',
            dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
    filesystem.add_option('-a', '--batch-file',
            dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
    filesystem.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    filesystem.add_option('-c', '--continue',
            action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
    filesystem.add_option('--no-continue',
            action='store_false', dest='continue_dl',
            help='do not resume partially downloaded files (restart from beginning)')
    filesystem.add_option('--cookies',
            dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
    filesystem.add_option('--no-part',
            action='store_true', dest='nopart', help='do not use .part files', default=False)
    filesystem.add_option('--no-mtime',
            action='store_false', dest='updatetime',
            help='do not use the Last-modified header to set the file modification time', default=True)
    filesystem.add_option('--write-description',
            action='store_true', dest='writedescription',
            help='write video description to a .description file', default=False)
    filesystem.add_option('--write-info-json',
            action='store_true', dest='writeinfojson',
            help='write video metadata to a .info.json file', default=False)


    postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
            help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
    postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
            help='"best", "aac", "vorbis" or "mp3"; best by default')
    postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
            help='ffmpeg audio bitrate specification, 128k by default')
    postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
            help='keeps the video file on disk after the post-processing; the video is erased by default')

    # Group registration order determines --help output order.
    parser.add_option_group(general)
    parser.add_option_group(selection)
    parser.add_option_group(filesystem)
    parser.add_option_group(verbosity)
    parser.add_option_group(video_format)
    parser.add_option_group(authentication)
    parser.add_option_group(postproc)

    opts, args = parser.parse_args()

    return parser, opts, args
3851
+
3852
def gen_extractors():
    """Instantiate one extractor of every supported kind, in priority order.

    Order is significant: for a given URL the first suitable extractor in
    the returned list handles it, so the catch-all GenericIE must stay last.
    """
    # The YouTube/Google/Yahoo search, user and playlist extractors wrap a
    # shared instance of their respective base extractor.
    yt = YoutubeIE()
    goog = GoogleIE()
    yahoo = YahooIE()
    extractors = [
        YoutubePlaylistIE(yt),
        YoutubeUserIE(yt),
        YoutubeSearchIE(yt),
        yt,
        MetacafeIE(yt),
        DailymotionIE(),
        goog,
        GoogleSearchIE(goog),
        PhotobucketIE(),
        yahoo,
        YahooSearchIE(yahoo),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),

        GenericIE(),
    ]
    return extractors
3883
+
3884
def main():
    """Program entry point: parse options, validate them, wire up the
    FileDownloader with extractors and post-processors, and run the
    downloads. Exits the process with the downloader's return code.
    """
    parser, opts, args = parseOpts()

    # Open appropriate CookieJar: in-memory when no --cookies file is given,
    # otherwise a Mozilla-format jar loaded from disk (if readable).
    if opts.cookiefile is None:
        jar = cookielib.CookieJar()
    else:
        try:
            jar = cookielib.MozillaCookieJar(opts.cookiefile)
            if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
                jar.load()
        except (IOError, OSError), err:
            sys.exit(u'ERROR: unable to open cookie file')

    # Dump user agent
    if opts.dump_user_agent:
        print std_headers['User-Agent']
        sys.exit(0)

    # Batch file verification: read URLs from the file (or stdin for '-'),
    # skipping blank lines and comment lines starting with #, / or ;.
    batchurls = []
    if opts.batchfile is not None:
        try:
            if opts.batchfile == '-':
                batchfd = sys.stdin
            else:
                batchfd = open(opts.batchfile, 'r')
            batchurls = batchfd.readlines()
            batchurls = [x.strip() for x in batchurls]
            batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
        except IOError:
            sys.exit(u'ERROR: batch file could not be read')
    all_urls = batchurls + args

    # General configuration: install a global opener with proxy support,
    # the cookie jar, and the project's gzip/deflate handler.
    cookie_processor = urllib2.HTTPCookieProcessor(jar)
    opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
    urllib2.install_opener(opener)
    socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

    extractors = gen_extractors()

    # --list-extractors: print each extractor name followed by the given
    # URLs it would handle (each URL attributed to its first match only).
    if opts.list_extractors:
        for ie in extractors:
            print(ie.IE_NAME)
            matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
            all_urls = filter(lambda url: url not in matchedUrls, all_urls)
            for mu in matchedUrls:
                print(u' ' + mu)
        sys.exit(0)

    # Conflicting, missing and erroneous options
    if opts.usenetrc and (opts.username is not None or opts.password is not None):
        parser.error(u'using .netrc conflicts with giving username/password')
    if opts.password is not None and opts.username is None:
        parser.error(u'account username missing')
    if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
        parser.error(u'using output template conflicts with using title, literal title or auto number')
    if opts.usetitle and opts.useliteral:
        parser.error(u'using title conflicts with using literal title')
    if opts.username is not None and opts.password is None:
        # NOTE(review): getpass is imported inside parseOpts(), not at
        # module scope — as written this line looks like it would raise
        # NameError when reached; confirm where getpass is imported.
        opts.password = getpass.getpass(u'Type account password and press return:')
    if opts.ratelimit is not None:
        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
        if numeric_limit is None:
            parser.error(u'invalid rate limit specified')
        opts.ratelimit = numeric_limit
    if opts.retries is not None:
        try:
            opts.retries = long(opts.retries)
        except (TypeError, ValueError), err:
            parser.error(u'invalid retry count specified')
    try:
        opts.playliststart = int(opts.playliststart)
        if opts.playliststart <= 0:
            raise ValueError(u'Playlist start must be positive')
    except (TypeError, ValueError), err:
        parser.error(u'invalid playlist start number specified')
    try:
        opts.playlistend = int(opts.playlistend)
        if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
            raise ValueError(u'Playlist end must be greater than playlist start')
    except (TypeError, ValueError), err:
        parser.error(u'invalid playlist end number specified')
    if opts.extractaudio:
        if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
            parser.error(u'invalid audio format specified')

    # File downloader. The outtmpl chain picks the first template whose
    # conditions hold, falling through to the plain '%(id)s.%(ext)s'.
    fd = FileDownloader({
        'usenetrc': opts.usenetrc,
        'username': opts.username,
        'password': opts.password,
        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
        'forceurl': opts.geturl,
        'forcetitle': opts.gettitle,
        'forcethumbnail': opts.getthumbnail,
        'forcedescription': opts.getdescription,
        'forcefilename': opts.getfilename,
        'forceformat': opts.getformat,
        'simulate': opts.simulate,
        'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
        'format': opts.format,
        'format_limit': opts.format_limit,
        'listformats': opts.listformats,
        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
            or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
            or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
            or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
            or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
            or u'%(id)s.%(ext)s'),
        'ignoreerrors': opts.ignoreerrors,
        'ratelimit': opts.ratelimit,
        'nooverwrites': opts.nooverwrites,
        'retries': opts.retries,
        'continuedl': opts.continue_dl,
        'noprogress': opts.noprogress,
        'playliststart': opts.playliststart,
        'playlistend': opts.playlistend,
        'logtostderr': opts.outtmpl == '-',
        'consoletitle': opts.consoletitle,
        'nopart': opts.nopart,
        'updatetime': opts.updatetime,
        'writedescription': opts.writedescription,
        'writeinfojson': opts.writeinfojson,
        'matchtitle': opts.matchtitle,
        'rejecttitle': opts.rejecttitle,
        })
    for extractor in extractors:
        fd.add_info_extractor(extractor)

    # PostProcessors
    if opts.extractaudio:
        fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

    # Update version
    if opts.update_self:
        updateSelf(fd, sys.argv[0])

    # Maybe do nothing: URLs are optional only when --update was requested.
    if len(all_urls) < 1:
        if not opts.update_self:
            parser.error(u'you must provide at least one URL')
        else:
            sys.exit()
    retcode = fd.download(all_urls)

    # Dump cookie jar if requested
    if opts.cookiefile is not None:
        try:
            jar.save()
        except (IOError, OSError), err:
            sys.exit(u'ERROR: unable to save cookie jar')

    sys.exit(retcode)
4043
+
4044
+
4045
# Script entry point: translate the exceptions that main() lets escape
# into process exit codes / messages.
if __name__ == '__main__':
    try:
        main()
    except DownloadError:
        # The downloader already reported the error; just signal failure.
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')
4054
+
4055
+ # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: