waymore 7.6__tar.gz → 7.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {waymore-7.6/waymore.egg-info → waymore-7.7}/PKG-INFO +2 -2
- {waymore-7.6 → waymore-7.7}/README.md +1 -1
- waymore-7.7/waymore/__init__.py +1 -0
- {waymore-7.6 → waymore-7.7}/waymore/waymore.py +591 -146
- {waymore-7.6 → waymore-7.7/waymore.egg-info}/PKG-INFO +2 -2
- waymore-7.6/waymore/__init__.py +0 -1
- {waymore-7.6 → waymore-7.7}/LICENSE +0 -0
- {waymore-7.6 → waymore-7.7}/pyproject.toml +0 -0
- {waymore-7.6 → waymore-7.7}/requirements.txt +0 -0
- {waymore-7.6 → waymore-7.7}/setup.cfg +0 -0
- {waymore-7.6 → waymore-7.7}/setup.py +0 -0
- {waymore-7.6 → waymore-7.7}/tests/test_import.py +0 -0
- {waymore-7.6 → waymore-7.7}/waymore.egg-info/SOURCES.txt +0 -0
- {waymore-7.6 → waymore-7.7}/waymore.egg-info/dependency_links.txt +0 -0
- {waymore-7.6 → waymore-7.7}/waymore.egg-info/entry_points.txt +0 -0
- {waymore-7.6 → waymore-7.7}/waymore.egg-info/requires.txt +0 -0
- {waymore-7.6 → waymore-7.7}/waymore.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: waymore
|
|
3
|
-
Version: 7.6
|
|
3
|
+
Version: 7.7
|
|
4
4
|
Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan, VirusTotal & Intelligence X!
|
|
5
5
|
Home-page: https://github.com/xnl-h4ck3r/waymore
|
|
6
6
|
Author: xnl-h4ck3r
|
|
@@ -21,7 +21,7 @@ Dynamic: license-file
|
|
|
21
21
|
|
|
22
22
|
<center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
|
|
23
23
|
|
|
24
|
-
## About - v7.6
|
|
24
|
+
## About - v7.7
|
|
25
25
|
|
|
26
26
|
The idea behind **waymore** is to find even more links from the Wayback Machine (plus other sources) than other existing tools.
|
|
27
27
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
|
|
2
2
|
|
|
3
|
-
## About - v7.6
|
|
3
|
+
## About - v7.7
|
|
4
4
|
|
|
5
5
|
The idea behind **waymore** is to find even more links from the Wayback Machine (plus other sources) than other existing tools.
|
|
6
6
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "7.7"
|
|
@@ -247,10 +247,10 @@ DEFAULT_LIMIT = 5000
|
|
|
247
247
|
DEFAULT_TIMEOUT = 30
|
|
248
248
|
|
|
249
249
|
# Exclusions used to exclude responses we will try to get from web.archive.org
|
|
250
|
-
DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource"
|
|
250
|
+
DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource,.wmv,.wma,.asx"
|
|
251
251
|
|
|
252
252
|
# MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API
|
|
253
|
-
DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/pdf,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff"
|
|
253
|
+
DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/pdf,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff,video/x-ms-asf,audio/x-ms-wma,audio/wma,application/x-mplayer2"
|
|
254
254
|
|
|
255
255
|
# Response code exclusions we will use to filter links and responses from web.archive.org through their API
|
|
256
256
|
DEFAULT_FILTER_CODE = "404,301,302"
|
|
@@ -297,6 +297,298 @@ INLINE_JS_EXCLUDE = [
|
|
|
297
297
|
".json",
|
|
298
298
|
]
|
|
299
299
|
|
|
300
|
+
# Binary file extensions that should be saved as raw bytes, not text
|
|
301
|
+
BINARY_EXTENSIONS = frozenset(
|
|
302
|
+
[
|
|
303
|
+
".zip",
|
|
304
|
+
".gz",
|
|
305
|
+
".gzip",
|
|
306
|
+
".tar",
|
|
307
|
+
".rar",
|
|
308
|
+
".7z",
|
|
309
|
+
".bz2",
|
|
310
|
+
".xz",
|
|
311
|
+
".pdf",
|
|
312
|
+
".doc",
|
|
313
|
+
".docx",
|
|
314
|
+
".xls",
|
|
315
|
+
".xlsx",
|
|
316
|
+
".ppt",
|
|
317
|
+
".pptx",
|
|
318
|
+
".exe",
|
|
319
|
+
".msi",
|
|
320
|
+
".dll",
|
|
321
|
+
".bin",
|
|
322
|
+
".so",
|
|
323
|
+
".dmg",
|
|
324
|
+
".deb",
|
|
325
|
+
".rpm",
|
|
326
|
+
".png",
|
|
327
|
+
".jpg",
|
|
328
|
+
".jpeg",
|
|
329
|
+
".gif",
|
|
330
|
+
".bmp",
|
|
331
|
+
".ico",
|
|
332
|
+
".webp",
|
|
333
|
+
".svg",
|
|
334
|
+
".tiff",
|
|
335
|
+
".tif",
|
|
336
|
+
".mp3",
|
|
337
|
+
".mp4",
|
|
338
|
+
".wav",
|
|
339
|
+
".avi",
|
|
340
|
+
".mov",
|
|
341
|
+
".mkv",
|
|
342
|
+
".flv",
|
|
343
|
+
".wmv",
|
|
344
|
+
".webm",
|
|
345
|
+
".ogg",
|
|
346
|
+
".ttf",
|
|
347
|
+
".otf",
|
|
348
|
+
".woff",
|
|
349
|
+
".woff2",
|
|
350
|
+
".eot",
|
|
351
|
+
".class",
|
|
352
|
+
".jar",
|
|
353
|
+
".war",
|
|
354
|
+
".ear",
|
|
355
|
+
".pyc",
|
|
356
|
+
".pyo",
|
|
357
|
+
".o",
|
|
358
|
+
".a",
|
|
359
|
+
".lib",
|
|
360
|
+
".iso",
|
|
361
|
+
".img",
|
|
362
|
+
".sqlite",
|
|
363
|
+
".db",
|
|
364
|
+
".mdb",
|
|
365
|
+
".swf",
|
|
366
|
+
".fla",
|
|
367
|
+
]
|
|
368
|
+
)
|
|
369
|
+
|
|
370
|
+
# Binary MIME types that should be saved as raw bytes, not text
|
|
371
|
+
BINARY_MIME_TYPES = frozenset(
|
|
372
|
+
[
|
|
373
|
+
"application/zip",
|
|
374
|
+
"application/x-zip-compressed",
|
|
375
|
+
"application/x-gzip",
|
|
376
|
+
"application/gzip",
|
|
377
|
+
"application/x-tar",
|
|
378
|
+
"application/x-rar-compressed",
|
|
379
|
+
"application/x-7z-compressed",
|
|
380
|
+
"application/x-bzip2",
|
|
381
|
+
"application/x-xz",
|
|
382
|
+
"application/pdf",
|
|
383
|
+
"application/msword",
|
|
384
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
385
|
+
"application/vnd.ms-excel",
|
|
386
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
387
|
+
"application/vnd.ms-powerpoint",
|
|
388
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
389
|
+
"application/x-msdownload",
|
|
390
|
+
"application/x-msi",
|
|
391
|
+
"application/x-dosexec",
|
|
392
|
+
"application/octet-stream",
|
|
393
|
+
"image/png",
|
|
394
|
+
"image/jpeg",
|
|
395
|
+
"image/gif",
|
|
396
|
+
"image/bmp",
|
|
397
|
+
"image/x-icon",
|
|
398
|
+
"image/webp",
|
|
399
|
+
"image/tiff",
|
|
400
|
+
"audio/mpeg",
|
|
401
|
+
"audio/wav",
|
|
402
|
+
"audio/ogg",
|
|
403
|
+
"audio/webm",
|
|
404
|
+
"video/mp4",
|
|
405
|
+
"video/avi",
|
|
406
|
+
"video/quicktime",
|
|
407
|
+
"video/x-msvideo",
|
|
408
|
+
"video/x-matroska",
|
|
409
|
+
"video/webm",
|
|
410
|
+
"video/ogg",
|
|
411
|
+
"font/ttf",
|
|
412
|
+
"font/otf",
|
|
413
|
+
"font/woff",
|
|
414
|
+
"font/woff2",
|
|
415
|
+
"application/x-font-ttf",
|
|
416
|
+
"application/x-font-otf",
|
|
417
|
+
"application/font-woff",
|
|
418
|
+
"application/font-woff2",
|
|
419
|
+
"application/java-archive",
|
|
420
|
+
"application/x-java-class",
|
|
421
|
+
"application/x-shockwave-flash",
|
|
422
|
+
"application/x-sqlite3",
|
|
423
|
+
"application/x-iso9660-image",
|
|
424
|
+
]
|
|
425
|
+
)
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
# Byte prefixes that identify a response as text. They are compared against the
# lowercased start of the response after leading whitespace has been stripped.
_TEXT_SIGNATURES = (
    b"<!doctype",  # HTML doctype
    b"<html",  # HTML tag
    b"<head",  # HTML head
    b"<body",  # HTML body
    b"<?xml",  # XML declaration
    b"<svg",  # SVG image (actually XML text)
    b"{",  # JSON object
    b"[",  # JSON array
    b"/*",  # CSS/JS comment
    b"//",  # JS comment
    b"#!",  # Shebang (shell scripts)
    b"var ",  # JavaScript
    b"let ",  # JavaScript
    b"const ",  # JavaScript
    b"function",  # JavaScript
    b"import ",  # JavaScript/Python
    b"export ",  # JavaScript
    b"class ",  # Various languages
    b"def ",  # Python
)

# Magic bytes (file signatures) that identify a response as binary. They are
# compared, case-sensitively, against the whitespace-stripped start of the response.
# The MP4 "ftyp" marker is NOT listed here because it sits at offset 4, not offset 0.
_BINARY_SIGNATURES = (
    b"%PDF",  # PDF
    b"PK\x03\x04",  # ZIP, DOCX, XLSX, etc.
    b"PK\x05\x06",  # Empty ZIP
    b"\x1f\x8b",  # GZIP
    b"\x89PNG",  # PNG
    b"\xff\xd8\xff",  # JPEG
    b"GIF87a",  # GIF
    b"GIF89a",  # GIF
    b"BM",  # BMP
    b"RIFF",  # WAV, AVI, WebP
    b"\x00\x00\x00",  # Various binary formats (MP4, etc.)
    b"ID3",  # MP3 with ID3 tag
    b"\xff\xfb",  # MP3
    b"\xff\xfa",  # MP3
    b"OggS",  # OGG
    b"\x4d\x5a",  # EXE/DLL (MZ header)
    b"\x7fELF",  # Linux ELF binary
    b"\xca\xfe\xba\xbe",  # Java class file
    b"\x30\x26\xb2\x75",  # ASF/WMV/WMA (first 4 bytes of ASF GUID)
    b"FLV\x01",  # FLV (Flash Video)
    b"Rar!\x1a\x07",  # RAR archive
    b"7z\xbc\xaf\x27\x1c",  # 7-Zip archive
    b"\x1a\x45\xdf\xa3",  # WebM/MKV (EBML)
    b"II\x2a\x00",  # TIFF (Intel byte order)
    b"MM\x00\x2a",  # TIFF (Motorola byte order)
    b"\x00\x00\x01\x00",  # ICO (Windows Icon)
    b"\x00\x00\x02\x00",  # CUR (Windows Cursor)
    b"wOFF",  # WOFF font
    b"wOF2",  # WOFF2 font
    b"FWS",  # SWF (uncompressed Flash)
    b"CWS",  # SWF (zlib compressed Flash)
    b"ZWS",  # SWF (LZMA compressed Flash)
    b"\x00\x01\x00\x00",  # TrueType font
    b"OTTO",  # OpenType font with CFF
)

# MIME types outside text/* that are nevertheless text
_TEXT_MIME_TYPES = frozenset(
    [
        "text/html",
        "text/plain",
        "text/css",
        "text/javascript",
        "text/xml",
        "text/csv",
        "text/markdown",
        "application/json",
        "application/javascript",
        "application/xml",
        "application/xhtml+xml",
        "application/rss+xml",
        "application/atom+xml",
    ]
)


def isBinaryContent(contentBytes, contentType, url=""):
    """
    Determine if content should be treated as binary based on actual content, Content-Type, and URL.

    Priority (highest to lowest):
    1. Content inspection - text signatures first, then binary magic bytes (most reliable)
    2. Content-Type header
    3. URL extension (least reliable - archive might have captured an HTML error page)

    Args:
        contentBytes: The raw response bytes (only the first 100 bytes are inspected)
        contentType: The Content-Type header value (parameters after ";" are ignored)
        url: The URL (optional, used as fallback)

    Returns True if content is binary and should be saved as raw bytes, otherwise False.
    Unusable arguments (e.g. None) are skipped rather than raised, so the default is False.
    """
    # STEP 1: Check actual content (most reliable)
    try:
        if contentBytes:
            # Strip leading whitespace/newlines so indented markup is still recognised
            preview = contentBytes[:100].lstrip()

            # Text markers win over everything else, regardless of extension or header
            if preview.lower().startswith(_TEXT_SIGNATURES):
                return False  # Definitely text, not binary

            if preview.startswith(_BINARY_SIGNATURES):
                return True  # Definitely binary

            # MP4/M4A/MOV: "ftyp" follows a 4 byte size prefix, so it has to be
            # tested at offset 4 of the raw bytes rather than with startswith
            if contentBytes[4:8] == b"ftyp":
                return True
    except (AttributeError, TypeError):
        # contentBytes was not bytes-like; fall through to the header check
        pass

    # STEP 2: Check Content-Type header
    try:
        if contentType:
            mimeType = contentType.lower().split(";")[0].strip()

            # Explicit text types
            if mimeType in _TEXT_MIME_TYPES or mimeType.startswith("text/"):
                return False  # Text type

            # Known binary types
            if mimeType in BINARY_MIME_TYPES:
                return True

            # Generic binary prefixes
            if mimeType.startswith(("image/", "audio/", "video/")):
                return True

            # application/* is often binary, but not always - be conservative
            if mimeType.startswith("application/") and (
                "octet-stream" in mimeType or "binary" in mimeType
            ):
                return True
    except (AttributeError, TypeError):
        # contentType was not a string; fall through to the URL check
        pass

    # STEP 3: Check URL extension as last resort (same rule used before the request is made)
    if url and isLikelyBinaryUrl(url):
        return True

    # Default: treat as text (safer - text processing won't corrupt text)
    return False
|
|
591
|
+
|
|
300
592
|
|
|
301
593
|
# Get memory usage for
|
|
302
594
|
def getMemory():
|
|
@@ -955,16 +1247,12 @@ def showOptions():
|
|
|
955
1247
|
)
|
|
956
1248
|
)
|
|
957
1249
|
|
|
1250
|
+
# Only show --source-ip if it's explicitly configured
|
|
958
1251
|
if SOURCE_IP:
|
|
959
1252
|
write(
|
|
960
1253
|
colored("--source-ip: " + str(SOURCE_IP), "magenta")
|
|
961
1254
|
+ colored(" Outbound requests will bind to this IP.", "white")
|
|
962
1255
|
)
|
|
963
|
-
else:
|
|
964
|
-
write(
|
|
965
|
-
colored("--source-ip: default", "magenta")
|
|
966
|
-
+ colored(" Outbound IP determined by OS routing table.", "white")
|
|
967
|
-
)
|
|
968
1256
|
|
|
969
1257
|
write()
|
|
970
1258
|
|
|
@@ -1497,6 +1785,63 @@ def fixArchiveOrgUrl(url):
|
|
|
1497
1785
|
return url
|
|
1498
1786
|
|
|
1499
1787
|
|
|
1788
|
+
def isLikelyBinaryUrl(url):
    """
    Check if a URL likely points to a binary file based on its extension.
    This is used BEFORE making a request to decide if we need the raw/id_ version.

    Args:
        url: The URL, optionally prefixed (e.g. Wayback timestamp or URLScan UUID)

    Returns True if the last path segment of the URL ends with an extension listed in
    BINARY_EXTENSIONS, otherwise False. Values that cannot be parsed return False.
    """
    try:
        # Extract actual URL from prefixed formats (Wayback timestamp/URLScan UUID).
        # Use the EARLIEST scheme: a later "https://" may just be a link embedded in
        # the query string of an "http://" URL and must not replace the real URL.
        schemePositions = [
            pos for pos in (url.find("http://"), url.find("https://")) if pos >= 0
        ]
        actualUrl = url[min(schemePositions):] if schemePositions else url

        # urlparse already separates the query string, so path never contains "?"
        path = urlparse(actualUrl.strip()).path.lower()

        # Only the final path segment can carry the file extension
        lastSegment = path.rsplit("/", 1)[-1]
        if "." not in lastSegment:
            return False

        ext = "." + lastSegment.rsplit(".", 1)[-1]
        return ext in BINARY_EXTENSIONS
    except (AttributeError, TypeError, ValueError):
        # url was not a string, or urlparse rejected it (e.g. malformed IPv6 host)
        return False
|
|
1814
|
+
|
|
1815
|
+
|
|
1816
|
+
def addRawModifier(archiveUrl):
    """
    Add 'id_' modifier to Wayback Machine URL to get raw/original content.
    This is essential for binary files to avoid Wayback modifications.

    The modifier is only applied when the path segment after "/web/" is a timestamp
    (1-14 digits), optionally followed by an existing two letter modifier such as "if_".
    An existing modifier is replaced by "id_" instead of being stacked. Any other URL
    (no "/web/", no timestamp, already "id_") is returned unchanged.

    Example:
    Input: https://web.archive.org/web/20090315210455/http://example.com/file.wmv
    Output: https://web.archive.org/web/20090315210455id_/http://example.com/file.wmv

    Args:
        archiveUrl: The Wayback Machine URL to modify

    Returns the URL with the id_ modifier, or the original value if it can't be applied.
    """
    if not isinstance(archiveUrl, str):
        return archiveUrl

    marker = "/web/"
    webPos = archiveUrl.find(marker)
    if webPos < 0:
        return archiveUrl

    # The timestamp segment runs from just after "/web/" to the next "/"
    segmentStart = webPos + len(marker)
    segmentEnd = archiveUrl.find("/", segmentStart)
    if segmentEnd <= segmentStart:
        return archiveUrl

    segment = archiveUrl[segmentStart:segmentEnd]
    if not segment.isascii():
        return archiveUrl

    # Split off an existing two letter modifier (e.g. "if_", "js_", "id_") if present
    timestamp, modifier = segment, ""
    if len(segment) > 3 and segment.endswith("_") and segment[-3:-1].isalpha():
        timestamp, modifier = segment[:-3], segment[-3:]

    # Without a real timestamp (e.g. ".../web/http://...") inserting id_ would corrupt the URL
    if not (timestamp.isdigit() and len(timestamp) <= 14):
        return archiveUrl

    # Only add id_ if it's not already there
    if modifier == "id_":
        return archiveUrl

    return archiveUrl[:segmentStart] + timestamp + "id_" + archiveUrl[segmentEnd:]
|
|
1843
|
+
|
|
1844
|
+
|
|
1500
1845
|
# Add a link to the linksFound collection for archived responses (including the timestamp prefix)
|
|
1501
1846
|
def linksFoundResponseAdd(link):
|
|
1502
1847
|
global linksFound, argsInput, argsInputHostname, links_lock
|
|
@@ -1581,6 +1926,12 @@ def processArchiveUrl(url):
|
|
|
1581
1926
|
if stopProgram is None:
|
|
1582
1927
|
|
|
1583
1928
|
archiveUrl = "https://web.archive.org/web/" + fixArchiveOrgUrl(url)
|
|
1929
|
+
|
|
1930
|
+
# For binary files, add id_ modifier to get raw/original content
|
|
1931
|
+
# This prevents Wayback Machine from modifying the content
|
|
1932
|
+
if isLikelyBinaryUrl(url):
|
|
1933
|
+
archiveUrl = addRawModifier(archiveUrl)
|
|
1934
|
+
|
|
1584
1935
|
hashValue = ""
|
|
1585
1936
|
|
|
1586
1937
|
# Get memory usage every 100 responses
|
|
@@ -1593,6 +1944,18 @@ def processArchiveUrl(url):
|
|
|
1593
1944
|
# Make a request to the web archive
|
|
1594
1945
|
try:
|
|
1595
1946
|
try:
|
|
1947
|
+
try:
|
|
1948
|
+
if os.environ.get("USER") == "xnl":
|
|
1949
|
+
writerr(
|
|
1950
|
+
colored(
|
|
1951
|
+
"[ DBG ] Requesting file " + archiveUrl,
|
|
1952
|
+
"yellow",
|
|
1953
|
+
attrs=["dark"],
|
|
1954
|
+
)
|
|
1955
|
+
)
|
|
1956
|
+
except Exception:
|
|
1957
|
+
pass
|
|
1958
|
+
|
|
1596
1959
|
# Choose a random user agent string to use for any requests
|
|
1597
1960
|
userAgent = random.choice(USER_AGENT)
|
|
1598
1961
|
|
|
@@ -1604,146 +1967,175 @@ def processArchiveUrl(url):
|
|
|
1604
1967
|
headers={"User-Agent": userAgent},
|
|
1605
1968
|
allow_redirects=True,
|
|
1606
1969
|
)
|
|
1607
|
-
|
|
1970
|
+
|
|
1971
|
+
# Get raw content bytes first
|
|
1972
|
+
contentBytes = resp.content
|
|
1973
|
+
|
|
1608
1974
|
try:
|
|
1609
|
-
contentType = resp.headers.get("Content-Type").split(";")[0].lower()
|
|
1975
|
+
contentType = resp.headers.get("Content-Type", "").split(";")[0].lower()
|
|
1610
1976
|
except Exception:
|
|
1611
1977
|
contentType = ""
|
|
1612
1978
|
|
|
1979
|
+
# Determine if this is binary content based on actual content, Content-Type, and URL
|
|
1980
|
+
isBinary = isBinaryContent(contentBytes, contentType, url)
|
|
1981
|
+
|
|
1982
|
+
if isBinary:
|
|
1983
|
+
# For binary files, use raw bytes as-is
|
|
1984
|
+
archiveContent = contentBytes
|
|
1985
|
+
archiveHtml = None # Not used for binary files
|
|
1986
|
+
else:
|
|
1987
|
+
# For text files, decode to string
|
|
1988
|
+
archiveHtml = contentBytes.decode("utf-8", errors="replace")
|
|
1989
|
+
archiveContent = None # Not used for text files
|
|
1990
|
+
|
|
1613
1991
|
# Only create a file if there is a response
|
|
1614
|
-
if len(archiveHtml)
|
|
1992
|
+
responseLength = len(archiveContent) if isBinary else len(archiveHtml)
|
|
1993
|
+
if responseLength != 0:
|
|
1615
1994
|
|
|
1995
|
+
# For text files, check for custom 404 pages
|
|
1616
1996
|
# If the FILTER_CODE doesn't include 404, OR
|
|
1617
1997
|
# If the FILTER_CODE includes 404, and it doesn't seem to be a custom 404 page
|
|
1618
|
-
if
|
|
1619
|
-
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
archiveHtml = (
|
|
1626
|
-
"/* Original URL: " + archiveUrl + " */\n" + archiveHtml
|
|
1998
|
+
if (
|
|
1999
|
+
isBinary
|
|
2000
|
+
or "404" not in FILTER_CODE
|
|
2001
|
+
or (
|
|
2002
|
+
"404" in FILTER_CODE
|
|
2003
|
+
and not re.findall(
|
|
2004
|
+
REGEX_404, archiveHtml, re.DOTALL | re.IGNORECASE
|
|
1627
2005
|
)
|
|
1628
|
-
|
|
1629
|
-
# Remove all web archive references in the response
|
|
1630
|
-
archiveHtml = re.sub(
|
|
1631
|
-
r'\<script type=\"text\/javascript" src=\"\/_static\/js\/bundle-playback\.js\?v=[A-Za-z0-9]*" charset="utf-8"><\/script>\n<script type="text\/javascript" src="\/_static\/js\/wombat\.js.*\<\!-- End Wayback Rewrite JS Include --\>',
|
|
1632
|
-
"",
|
|
1633
|
-
archiveHtml,
|
|
1634
|
-
1,
|
|
1635
|
-
flags=re.DOTALL | re.IGNORECASE,
|
|
1636
|
-
)
|
|
1637
|
-
archiveHtml = re.sub(
|
|
1638
|
-
r"\<script src=\"\/\/archive\.org.*\<\!-- End Wayback Rewrite JS Include --\>",
|
|
1639
|
-
"",
|
|
1640
|
-
archiveHtml,
|
|
1641
|
-
1,
|
|
1642
|
-
flags=re.DOTALL | re.IGNORECASE,
|
|
1643
|
-
)
|
|
1644
|
-
archiveHtml = re.sub(
|
|
1645
|
-
r"\<script\>window\.RufflePlayer[^\<]*\<\/script\>",
|
|
1646
|
-
"",
|
|
1647
|
-
archiveHtml,
|
|
1648
|
-
1,
|
|
1649
|
-
flags=re.DOTALL | re.IGNORECASE,
|
|
1650
|
-
)
|
|
1651
|
-
archiveHtml = re.sub(
|
|
1652
|
-
r"\<\!-- BEGIN WAYBACK TOOLBAR INSERT --\>.*\<\!-- END WAYBACK TOOLBAR INSERT --\>",
|
|
1653
|
-
"",
|
|
1654
|
-
archiveHtml,
|
|
1655
|
-
1,
|
|
1656
|
-
flags=re.DOTALL | re.IGNORECASE,
|
|
1657
|
-
)
|
|
1658
|
-
archiveHtml = re.sub(
|
|
1659
|
-
r"(}\n)?(\/\*|<!--\n)\s*FILE ARCHIVED ON.*108\(a\)\(3\)\)\.\n(\*\/|-->)",
|
|
1660
|
-
"",
|
|
1661
|
-
archiveHtml,
|
|
1662
|
-
1,
|
|
1663
|
-
flags=re.DOTALL | re.IGNORECASE,
|
|
1664
|
-
)
|
|
1665
|
-
archiveHtml = re.sub(
|
|
1666
|
-
r"var\s_____WB\$wombat\$assign\$function.*WB\$wombat\$assign\$function_____\(\"opener\"\);",
|
|
1667
|
-
"",
|
|
1668
|
-
archiveHtml,
|
|
1669
|
-
1,
|
|
1670
|
-
flags=re.DOTALL | re.IGNORECASE,
|
|
1671
|
-
)
|
|
1672
|
-
archiveHtml = re.sub(
|
|
1673
|
-
r"(\<\!--|\/\*)\nplayback timings.*(--\>|\*\/)",
|
|
1674
|
-
"",
|
|
1675
|
-
archiveHtml,
|
|
1676
|
-
1,
|
|
1677
|
-
flags=re.DOTALL | re.IGNORECASE,
|
|
1678
|
-
)
|
|
1679
|
-
archiveHtml = re.sub(
|
|
1680
|
-
r"((https:)?\/\/web\.archive\.org)?\/web\/[0-9]{14}([A-Za-z]{2}\_)?\/",
|
|
1681
|
-
"",
|
|
1682
|
-
archiveHtml,
|
|
1683
|
-
flags=re.IGNORECASE,
|
|
1684
|
-
)
|
|
1685
|
-
archiveHtml = re.sub(
|
|
1686
|
-
r"((https:)?\\\/\\\/web\.archive\.org)?\\\/web\\\/[0-9]{14}([A-Za-z]{2}\_)?\\\/",
|
|
1687
|
-
"",
|
|
1688
|
-
archiveHtml,
|
|
1689
|
-
flags=re.IGNORECASE,
|
|
1690
|
-
)
|
|
1691
|
-
archiveHtml = re.sub(
|
|
1692
|
-
r"((https:)?%2F%2Fweb\.archive\.org)?%2Fweb%2F[0-9]{14}([A-Za-z]{2}\_)?%2F",
|
|
1693
|
-
"",
|
|
1694
|
-
archiveHtml,
|
|
1695
|
-
flags=re.IGNORECASE,
|
|
1696
|
-
)
|
|
1697
|
-
archiveHtml = re.sub(
|
|
1698
|
-
r"((https:)?\\u002F\\u002Fweb\.archive\.org)?\\u002Fweb\\u002F[0-9]{14}([A-Za-z]{2}\_)?\\u002F",
|
|
1699
|
-
"",
|
|
1700
|
-
archiveHtml,
|
|
1701
|
-
flags=re.IGNORECASE,
|
|
1702
|
-
)
|
|
1703
|
-
archiveHtml = re.sub(
|
|
1704
|
-
r"\<script type=\"text\/javascript\">\s*__wm\.init\(\"https:\/\/web\.archive\.org\/web\"\);[^\<]*\<\/script\>",
|
|
1705
|
-
"",
|
|
1706
|
-
archiveHtml,
|
|
1707
|
-
flags=re.IGNORECASE,
|
|
1708
|
-
)
|
|
1709
|
-
archiveHtml = re.sub(
|
|
1710
|
-
r'\<script type=\"text\/javascript\" src="https:\/\/web-static\.archive\.org[^\<]*\<\/script\>',
|
|
1711
|
-
"",
|
|
1712
|
-
archiveHtml,
|
|
1713
|
-
flags=re.IGNORECASE,
|
|
1714
|
-
)
|
|
1715
|
-
archiveHtml = re.sub(
|
|
1716
|
-
r"\<link rel=\"stylesheet\" type=\"text\/css\" href=\"https:\/\/web-static\.archive\.org[^\<]*\/\>",
|
|
1717
|
-
"",
|
|
1718
|
-
archiveHtml,
|
|
1719
|
-
flags=re.IGNORECASE,
|
|
1720
|
-
)
|
|
1721
|
-
archiveHtml = re.sub(
|
|
1722
|
-
r"\<\!-- End Wayback Rewrite JS Include --\>",
|
|
1723
|
-
"",
|
|
1724
|
-
archiveHtml,
|
|
1725
|
-
re.IGNORECASE,
|
|
1726
2006
|
)
|
|
2007
|
+
):
|
|
1727
2008
|
|
|
1728
|
-
#
|
|
1729
|
-
if
|
|
1730
|
-
|
|
1731
|
-
|
|
2009
|
+
# For text files only: Add URL comment and clean up wayback references
|
|
2010
|
+
if not isBinary:
|
|
2011
|
+
# Add the URL as a comment at the start of the response
|
|
2012
|
+
if args.url_filename:
|
|
2013
|
+
archiveHtml = (
|
|
2014
|
+
"/* Original URL: " + archiveUrl + " */\n" + archiveHtml
|
|
2015
|
+
)
|
|
2016
|
+
|
|
2017
|
+
# Remove all web archive references in the response
|
|
2018
|
+
archiveHtml = re.sub(
|
|
2019
|
+
r'\<script type=\"text\/javascript" src=\"\/_static\/js\/bundle-playback\.js\?v=[A-Za-z0-9]*" charset="utf-8"\><\/script>\n<script type="text\/javascript" src="\/_static\/js\/wombat\.js.*\<\!-- End Wayback Rewrite JS Include --\>',
|
|
2020
|
+
"",
|
|
2021
|
+
archiveHtml,
|
|
2022
|
+
1,
|
|
2023
|
+
flags=re.DOTALL | re.IGNORECASE,
|
|
1732
2024
|
)
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
"
|
|
2025
|
+
archiveHtml = re.sub(
|
|
2026
|
+
r"\<script src=\"\/\/archive\.org.*\<\!-- End Wayback Rewrite JS Include --\>",
|
|
2027
|
+
"",
|
|
2028
|
+
archiveHtml,
|
|
2029
|
+
1,
|
|
2030
|
+
flags=re.DOTALL | re.IGNORECASE,
|
|
2031
|
+
)
|
|
2032
|
+
archiveHtml = re.sub(
|
|
2033
|
+
r"\<script\>window\.RufflePlayer[^\<]*\<\/script\>",
|
|
2034
|
+
"",
|
|
2035
|
+
archiveHtml,
|
|
2036
|
+
1,
|
|
2037
|
+
flags=re.DOTALL | re.IGNORECASE,
|
|
2038
|
+
)
|
|
2039
|
+
archiveHtml = re.sub(
|
|
2040
|
+
r"\<\!-- BEGIN WAYBACK TOOLBAR INSERT --\>.*\<\!-- END WAYBACK TOOLBAR INSERT --\>",
|
|
2041
|
+
"",
|
|
2042
|
+
archiveHtml,
|
|
2043
|
+
1,
|
|
2044
|
+
flags=re.DOTALL | re.IGNORECASE,
|
|
2045
|
+
)
|
|
2046
|
+
archiveHtml = re.sub(
|
|
2047
|
+
r"(}\n)?(\/\*|<\!--\n)\s*FILE ARCHIVED ON.*108\(a\)\(3\)\)\.\n(\*\/|--\>)",
|
|
2048
|
+
"",
|
|
2049
|
+
archiveHtml,
|
|
2050
|
+
1,
|
|
2051
|
+
flags=re.DOTALL | re.IGNORECASE,
|
|
2052
|
+
)
|
|
2053
|
+
archiveHtml = re.sub(
|
|
2054
|
+
r"var\s_____WB\$wombat\$assign\$function.*WB\$wombat\$assign\$function_____\(\"opener\"\);",
|
|
2055
|
+
"",
|
|
2056
|
+
archiveHtml,
|
|
2057
|
+
1,
|
|
2058
|
+
flags=re.DOTALL | re.IGNORECASE,
|
|
2059
|
+
)
|
|
2060
|
+
archiveHtml = re.sub(
|
|
2061
|
+
r"(\<\!--|\/\*)\nplayback timings.*(--\>|\*\/)",
|
|
2062
|
+
"",
|
|
2063
|
+
archiveHtml,
|
|
2064
|
+
1,
|
|
2065
|
+
flags=re.DOTALL | re.IGNORECASE,
|
|
2066
|
+
)
|
|
2067
|
+
archiveHtml = re.sub(
|
|
2068
|
+
r"((https:)?\/\/web\.archive\.org)?\/web\/[0-9]{14}([A-Za-z]{2}\_)?\/",
|
|
2069
|
+
"",
|
|
2070
|
+
archiveHtml,
|
|
2071
|
+
flags=re.IGNORECASE,
|
|
2072
|
+
)
|
|
2073
|
+
archiveHtml = re.sub(
|
|
2074
|
+
r"((https:)?\\\/\\\/web\.archive\.org)?\\\/web\\\/[0-9]{14}([A-Za-z]{2}\_)?\\\/",
|
|
2075
|
+
"",
|
|
2076
|
+
archiveHtml,
|
|
2077
|
+
flags=re.IGNORECASE,
|
|
2078
|
+
)
|
|
2079
|
+
archiveHtml = re.sub(
|
|
2080
|
+
r"((https:)?%2F%2Fweb\.archive\.org)?%2Fweb%2F[0-9]{14}([A-Za-z]{2}\_)?%2F",
|
|
2081
|
+
"",
|
|
2082
|
+
archiveHtml,
|
|
2083
|
+
flags=re.IGNORECASE,
|
|
2084
|
+
)
|
|
2085
|
+
archiveHtml = re.sub(
|
|
2086
|
+
r"((https:)?\\u002F\\u002Fweb\.archive\.org)?\\u002Fweb\\u002F[0-9]{14}([A-Za-z]{2}\_)?\\u002F",
|
|
2087
|
+
"",
|
|
2088
|
+
archiveHtml,
|
|
2089
|
+
flags=re.IGNORECASE,
|
|
2090
|
+
)
|
|
2091
|
+
archiveHtml = re.sub(
|
|
2092
|
+
r"\<script type=\"text\/javascript\"\>\s*__wm\.init\(\"https:\/\/web\.archive\.org\/web\"\);[^\<]*\<\/script\>",
|
|
2093
|
+
"",
|
|
2094
|
+
archiveHtml,
|
|
2095
|
+
flags=re.IGNORECASE,
|
|
2096
|
+
)
|
|
2097
|
+
archiveHtml = re.sub(
|
|
2098
|
+
r'\<script type=\"text\/javascript\" src="https:\/\/web-static\.archive\.org[^\<]*\<\/script\>',
|
|
2099
|
+
"",
|
|
2100
|
+
archiveHtml,
|
|
2101
|
+
flags=re.IGNORECASE,
|
|
2102
|
+
)
|
|
2103
|
+
archiveHtml = re.sub(
|
|
2104
|
+
r"\<link rel=\"stylesheet\" type=\"text\/css\" href=\"https:\/\/web-static\.archive\.org[^\<]*\/\>",
|
|
2105
|
+
"",
|
|
2106
|
+
archiveHtml,
|
|
2107
|
+
flags=re.IGNORECASE,
|
|
2108
|
+
)
|
|
2109
|
+
archiveHtml = re.sub(
|
|
2110
|
+
r"\<\!-- End Wayback Rewrite JS Include --\>",
|
|
2111
|
+
"",
|
|
2112
|
+
archiveHtml,
|
|
2113
|
+
re.IGNORECASE,
|
|
1736
2114
|
)
|
|
1737
|
-
|
|
1738
|
-
|
|
1739
|
-
|
|
2115
|
+
|
|
2116
|
+
# If there is a specific Wayback error in the response, raise an exception
|
|
2117
|
+
if (
|
|
2118
|
+
archiveHtml.lower().find(
|
|
2119
|
+
"wayback machine has not archived that url"
|
|
2120
|
+
)
|
|
2121
|
+
> 0
|
|
2122
|
+
or archiveHtml.lower().find(
|
|
2123
|
+
"snapshot cannot be displayed due to an internal error"
|
|
2124
|
+
)
|
|
2125
|
+
> 0
|
|
2126
|
+
):
|
|
2127
|
+
raise WayBackException
|
|
1740
2128
|
|
|
1741
2129
|
# Create file name based on url or hash value of the response, depending on selection. Ensure the file name isn't over 255 characters
|
|
1742
2130
|
if args.url_filename:
|
|
1743
2131
|
fileName = url.replace("/", "-").replace(":", "")
|
|
1744
2132
|
fileName = fileName[0:254]
|
|
1745
2133
|
else:
|
|
1746
|
-
|
|
2134
|
+
# For binary files, hash the raw bytes; for text, hash the text
|
|
2135
|
+
if isBinary:
|
|
2136
|
+
hashValue = filehash(archiveContent.hex())
|
|
2137
|
+
else:
|
|
2138
|
+
hashValue = filehash(archiveHtml)
|
|
1747
2139
|
fileName = hashValue
|
|
1748
2140
|
|
|
1749
2141
|
# Determine extension of file from the content-type using the mimetypes library
|
|
@@ -1785,11 +2177,15 @@ def processArchiveUrl(url):
|
|
|
1785
2177
|
extension = "css"
|
|
1786
2178
|
elif "pdf" in extension:
|
|
1787
2179
|
extension = "pdf"
|
|
2180
|
+
elif "zip" in extension:
|
|
2181
|
+
extension = "zip"
|
|
2182
|
+
elif "gzip" in extension or "x-gzip" in extension:
|
|
2183
|
+
extension = "gz"
|
|
1788
2184
|
elif "plain" == extension:
|
|
1789
2185
|
extension = "txt"
|
|
1790
2186
|
|
|
1791
2187
|
# If extension is still blank, set to html if the content ends with HTML tag, otherwise set to unknown
|
|
1792
|
-
if extension == "":
|
|
2188
|
+
if extension == "" and not isBinary:
|
|
1793
2189
|
if (
|
|
1794
2190
|
archiveHtml.lower().strip().endswith("</html>")
|
|
1795
2191
|
or archiveHtml.lower()
|
|
@@ -1800,6 +2196,8 @@ def processArchiveUrl(url):
|
|
|
1800
2196
|
extension = "html"
|
|
1801
2197
|
else:
|
|
1802
2198
|
extension = "unknown"
|
|
2199
|
+
elif extension == "" and isBinary:
|
|
2200
|
+
extension = "bin"
|
|
1803
2201
|
|
|
1804
2202
|
fileName = fileName + "." + extension
|
|
1805
2203
|
|
|
@@ -1816,10 +2214,14 @@ def processArchiveUrl(url):
|
|
|
1816
2214
|
+ f"{fileName}"
|
|
1817
2215
|
)
|
|
1818
2216
|
|
|
1819
|
-
# Write the file
|
|
2217
|
+
# Write the file - binary mode for binary files, text mode for text files
|
|
1820
2218
|
try:
|
|
1821
|
-
|
|
1822
|
-
|
|
2219
|
+
if isBinary:
|
|
2220
|
+
responseFile = open(filePath, "wb")
|
|
2221
|
+
responseFile.write(archiveContent)
|
|
2222
|
+
else:
|
|
2223
|
+
responseFile = open(filePath, "w", encoding="utf8")
|
|
2224
|
+
responseFile.write(archiveHtml)
|
|
1823
2225
|
responseFile.close()
|
|
1824
2226
|
fileCount = fileCount + 1
|
|
1825
2227
|
except Exception as e:
|
|
@@ -1852,9 +2254,10 @@ def processArchiveUrl(url):
|
|
|
1852
2254
|
)
|
|
1853
2255
|
)
|
|
1854
2256
|
|
|
1855
|
-
# FOR DEBUGGING PURPOSES
|
|
2257
|
+
# FOR DEBUGGING PURPOSES (only for text files)
|
|
1856
2258
|
try:
|
|
1857
|
-
if os.environ.get("USER") == "xnl":
|
|
2259
|
+
if os.environ.get("USER") == "xnl" and not isBinary:
|
|
2260
|
+
|
|
1858
2261
|
debugText = ""
|
|
1859
2262
|
if archiveHtml.lower().find("archive.org") > 0:
|
|
1860
2263
|
debugText = "ARCHIVE.ORG"
|
|
@@ -1866,16 +2269,27 @@ def processArchiveUrl(url):
|
|
|
1866
2269
|
writerr(
|
|
1867
2270
|
colored(
|
|
1868
2271
|
getSPACER(
|
|
1869
|
-
'"'
|
|
2272
|
+
'[ DBG ] "'
|
|
1870
2273
|
+ fileName
|
|
1871
2274
|
+ '" CONTAINS '
|
|
1872
2275
|
+ debugText
|
|
1873
2276
|
+ " - CHECK ITS A VALID REFERENCE"
|
|
1874
2277
|
),
|
|
1875
2278
|
"yellow",
|
|
2279
|
+
attrs=["dark"],
|
|
1876
2280
|
)
|
|
1877
2281
|
)
|
|
1878
|
-
except Exception:
|
|
2282
|
+
except Exception as e:
|
|
2283
|
+
writerr(
|
|
2284
|
+
colored(
|
|
2285
|
+
'[ DBG ] Error - Failed to output debug info for "'
|
|
2286
|
+
+ archiveUrl
|
|
2287
|
+
+ '": '
|
|
2288
|
+
+ str(e),
|
|
2289
|
+
"red",
|
|
2290
|
+
attrs=["dark"],
|
|
2291
|
+
)
|
|
2292
|
+
)
|
|
1879
2293
|
pass
|
|
1880
2294
|
|
|
1881
2295
|
successCount = successCount + 1
|
|
@@ -2897,17 +3311,38 @@ def getURLScanDOM(originalUrl, domUrl):
|
|
|
2897
3311
|
resp = session.get(
|
|
2898
3312
|
domUrl, headers={"User-Agent": userAgent}, allow_redirects=True
|
|
2899
3313
|
)
|
|
2900
|
-
|
|
3314
|
+
|
|
3315
|
+
# Get raw content bytes first
|
|
3316
|
+
contentBytes = resp.content
|
|
3317
|
+
|
|
3318
|
+
# Get content type from response headers
|
|
3319
|
+
try:
|
|
3320
|
+
contentType = resp.headers.get("Content-Type", "").split(";")[0].lower()
|
|
3321
|
+
except Exception:
|
|
3322
|
+
contentType = ""
|
|
3323
|
+
|
|
3324
|
+
# Determine if this is binary content based on actual content, Content-Type, and URL
|
|
3325
|
+
isBinary = isBinaryContent(contentBytes, contentType, originalUrl)
|
|
3326
|
+
|
|
3327
|
+
if isBinary:
|
|
3328
|
+
# For binary files, use raw bytes as-is
|
|
3329
|
+
archiveContent = contentBytes
|
|
3330
|
+
archiveHtml = None
|
|
3331
|
+
else:
|
|
3332
|
+
# For text files, decode to string
|
|
3333
|
+
archiveHtml = contentBytes.decode("utf-8", errors="replace")
|
|
3334
|
+
archiveContent = None
|
|
2901
3335
|
|
|
2902
3336
|
# If there is a specific URLScan error in the response, raise an exception
|
|
2903
|
-
if archiveHtml.lower().strip() == "not found!":
|
|
3337
|
+
if not isBinary and archiveHtml.lower().strip() == "not found!":
|
|
2904
3338
|
raise WayBackException
|
|
2905
3339
|
|
|
2906
3340
|
# Only create a file if there is a response
|
|
2907
|
-
if len(archiveHtml) != 0:
|
|
3341
|
+
responseLength = len(archiveContent) if isBinary else len(archiveHtml)
|
|
3342
|
+
if responseLength != 0:
|
|
2908
3343
|
|
|
2909
|
-
# Add the URL as a comment at the start of the response
|
|
2910
|
-
if args.url_filename:
|
|
3344
|
+
# Add the URL as a comment at the start of the response (text files only)
|
|
3345
|
+
if not isBinary and args.url_filename:
|
|
2911
3346
|
archiveHtml = "/* Original URL: " + originalUrl + " */\n" + archiveHtml
|
|
2912
3347
|
|
|
2913
3348
|
# Create file name based on url or hash value of the response, depending on selection. Ensure the file name isn't over 255 characters
|
|
@@ -2915,7 +3350,11 @@ def getURLScanDOM(originalUrl, domUrl):
|
|
|
2915
3350
|
fileName = originalUrl.replace("/", "-").replace(":", "")
|
|
2916
3351
|
fileName = fileName[0:254]
|
|
2917
3352
|
else:
|
|
2918
|
-
|
|
3353
|
+
# For binary files, hash the raw bytes; for text, hash the text
|
|
3354
|
+
if isBinary:
|
|
3355
|
+
hashValue = filehash(archiveContent.hex())
|
|
3356
|
+
else:
|
|
3357
|
+
hashValue = filehash(archiveHtml)
|
|
2919
3358
|
fileName = hashValue
|
|
2920
3359
|
|
|
2921
3360
|
# Determine extension of file from the content-type using the mimetypes library
|
|
@@ -2933,7 +3372,7 @@ def getURLScanDOM(originalUrl, domUrl):
|
|
|
2933
3372
|
pass
|
|
2934
3373
|
|
|
2935
3374
|
# If the extension is blank, numeric, longer than 4 characters or not alphanumeric - then set to html if the content ends with HTML tag, otherwise set to unknown
|
|
2936
|
-
if extension == "":
|
|
3375
|
+
if extension == "" and not isBinary:
|
|
2937
3376
|
if (
|
|
2938
3377
|
archiveHtml.lower().strip().endswith("</html>")
|
|
2939
3378
|
or archiveHtml.lower().strip().endswith("</body>")
|
|
@@ -2944,6 +3383,8 @@ def getURLScanDOM(originalUrl, domUrl):
|
|
|
2944
3383
|
extension = "html"
|
|
2945
3384
|
else:
|
|
2946
3385
|
extension = "unknown"
|
|
3386
|
+
elif extension == "" and isBinary:
|
|
3387
|
+
extension = "bin"
|
|
2947
3388
|
|
|
2948
3389
|
fileName = fileName + "." + extension
|
|
2949
3390
|
|
|
@@ -2960,10 +3401,14 @@ def getURLScanDOM(originalUrl, domUrl):
|
|
|
2960
3401
|
+ f"{fileName}"
|
|
2961
3402
|
)
|
|
2962
3403
|
|
|
2963
|
-
# Write the file
|
|
3404
|
+
# Write the file - binary mode for binary files, text mode for text files
|
|
2964
3405
|
try:
|
|
2965
|
-
|
|
2966
|
-
|
|
3406
|
+
if isBinary:
|
|
3407
|
+
responseFile = open(filePath, "wb")
|
|
3408
|
+
responseFile.write(archiveContent)
|
|
3409
|
+
else:
|
|
3410
|
+
responseFile = open(filePath, "w", encoding="utf8")
|
|
3411
|
+
responseFile.write(archiveHtml)
|
|
2967
3412
|
responseFile.close()
|
|
2968
3413
|
fileCount = fileCount + 1
|
|
2969
3414
|
except Exception as e:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: waymore
|
|
3
|
-
Version: 7.6
|
|
3
|
+
Version: 7.7
|
|
4
4
|
Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan, VirusTotal & Intelligence X!
|
|
5
5
|
Home-page: https://github.com/xnl-h4ck3r/waymore
|
|
6
6
|
Author: xnl-h4ck3r
|
|
@@ -21,7 +21,7 @@ Dynamic: license-file
|
|
|
21
21
|
|
|
22
22
|
<center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
|
|
23
23
|
|
|
24
|
-
## About - v7.6
|
|
24
|
+
## About - v7.7
|
|
25
25
|
|
|
26
26
|
The idea behind **waymore** is to find even more links from the Wayback Machine (plus other sources) than other existing tools.
|
|
27
27
|
|
waymore-7.6/waymore/__init__.py
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "7.6"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|