waymore 7.6__py3-none-any.whl → 7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waymore/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "7.6"
1
+ __version__ = "7.7"
waymore/waymore.py CHANGED
@@ -247,10 +247,10 @@ DEFAULT_LIMIT = 5000
247
247
  DEFAULT_TIMEOUT = 30
248
248
 
249
249
  # Exclusions used to exclude responses we will try to get from web.archive.org
250
- DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource"
250
+ DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource,.wmv,.wma,.asx"
251
251
 
252
252
  # MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API
253
- DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/pdf,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff"
253
+ DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/pdf,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff,video/x-ms-asf,audio/x-ms-wma,audio/wma,application/x-mplayer2"
254
254
 
255
255
  # Response code exclusions we will use to filter links and responses from web.archive.org through their API
256
256
  DEFAULT_FILTER_CODE = "404,301,302"
@@ -297,6 +297,298 @@ INLINE_JS_EXCLUDE = [
297
297
  ".json",
298
298
  ]
299
299
 
300
+ # Binary file extensions that should be saved as raw bytes, not text
301
+ BINARY_EXTENSIONS = frozenset(
302
+ [
303
+ ".zip",
304
+ ".gz",
305
+ ".gzip",
306
+ ".tar",
307
+ ".rar",
308
+ ".7z",
309
+ ".bz2",
310
+ ".xz",
311
+ ".pdf",
312
+ ".doc",
313
+ ".docx",
314
+ ".xls",
315
+ ".xlsx",
316
+ ".ppt",
317
+ ".pptx",
318
+ ".exe",
319
+ ".msi",
320
+ ".dll",
321
+ ".bin",
322
+ ".so",
323
+ ".dmg",
324
+ ".deb",
325
+ ".rpm",
326
+ ".png",
327
+ ".jpg",
328
+ ".jpeg",
329
+ ".gif",
330
+ ".bmp",
331
+ ".ico",
332
+ ".webp",
333
+ ".svg",
334
+ ".tiff",
335
+ ".tif",
336
+ ".mp3",
337
+ ".mp4",
338
+ ".wav",
339
+ ".avi",
340
+ ".mov",
341
+ ".mkv",
342
+ ".flv",
343
+ ".wmv",
344
+ ".webm",
345
+ ".ogg",
346
+ ".ttf",
347
+ ".otf",
348
+ ".woff",
349
+ ".woff2",
350
+ ".eot",
351
+ ".class",
352
+ ".jar",
353
+ ".war",
354
+ ".ear",
355
+ ".pyc",
356
+ ".pyo",
357
+ ".o",
358
+ ".a",
359
+ ".lib",
360
+ ".iso",
361
+ ".img",
362
+ ".sqlite",
363
+ ".db",
364
+ ".mdb",
365
+ ".swf",
366
+ ".fla",
367
+ ]
368
+ )
369
+
370
+ # Binary MIME types that should be saved as raw bytes, not text
371
+ BINARY_MIME_TYPES = frozenset(
372
+ [
373
+ "application/zip",
374
+ "application/x-zip-compressed",
375
+ "application/x-gzip",
376
+ "application/gzip",
377
+ "application/x-tar",
378
+ "application/x-rar-compressed",
379
+ "application/x-7z-compressed",
380
+ "application/x-bzip2",
381
+ "application/x-xz",
382
+ "application/pdf",
383
+ "application/msword",
384
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
385
+ "application/vnd.ms-excel",
386
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
387
+ "application/vnd.ms-powerpoint",
388
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
389
+ "application/x-msdownload",
390
+ "application/x-msi",
391
+ "application/x-dosexec",
392
+ "application/octet-stream",
393
+ "image/png",
394
+ "image/jpeg",
395
+ "image/gif",
396
+ "image/bmp",
397
+ "image/x-icon",
398
+ "image/webp",
399
+ "image/tiff",
400
+ "audio/mpeg",
401
+ "audio/wav",
402
+ "audio/ogg",
403
+ "audio/webm",
404
+ "video/mp4",
405
+ "video/avi",
406
+ "video/quicktime",
407
+ "video/x-msvideo",
408
+ "video/x-matroska",
409
+ "video/webm",
410
+ "video/ogg",
411
+ "font/ttf",
412
+ "font/otf",
413
+ "font/woff",
414
+ "font/woff2",
415
+ "application/x-font-ttf",
416
+ "application/x-font-otf",
417
+ "application/font-woff",
418
+ "application/font-woff2",
419
+ "application/java-archive",
420
+ "application/x-java-class",
421
+ "application/x-shockwave-flash",
422
+ "application/x-sqlite3",
423
+ "application/x-iso9660-image",
424
+ ]
425
+ )
426
+
427
+
428
# Leading byte sequences that identify textual content. They are compared against the
# lower-cased start of the response, after any UTF-8 BOM and leading whitespace are removed.
_TEXT_SIGNATURES = (
    b"<!doctype",  # HTML doctype
    b"<html",  # HTML tag
    b"<head",  # HTML head
    b"<body",  # HTML body
    b"<?xml",  # XML declaration
    b"<svg",  # SVG image (actually XML text)
    b"{",  # JSON object
    b"[",  # JSON array
    b"/*",  # CSS/JS comment
    b"//",  # JS comment
    b"#!",  # Shebang (shell scripts)
    b"var ",  # JavaScript
    b"let ",  # JavaScript
    b"const ",  # JavaScript
    b"function",  # JavaScript
    b"import ",  # JavaScript/Python
    b"export ",  # JavaScript
    b"class ",  # Various languages
    b"def ",  # Python
)

# Magic numbers that are distinctive enough to be trusted on their own.
_BINARY_SIGNATURES = (
    b"%PDF",  # PDF
    b"PK\x03\x04",  # ZIP, DOCX, XLSX, etc.
    b"PK\x05\x06",  # Empty ZIP
    b"\x1f\x8b",  # GZIP
    b"\x89PNG",  # PNG
    b"\xff\xd8\xff",  # JPEG
    b"GIF87a",  # GIF
    b"GIF89a",  # GIF
    b"RIFF",  # WAV, AVI, WebP
    b"\x00\x00\x00",  # Various binary formats (MP4, etc.)
    b"\xff\xfb",  # MP3
    b"\xff\xfa",  # MP3
    b"OggS",  # OGG
    b"\x7fELF",  # Linux ELF binary
    b"\xca\xfe\xba\xbe",  # Java class file
    b"\x30\x26\xb2\x75",  # ASF/WMV/WMA (first 4 bytes of ASF GUID)
    b"FLV\x01",  # FLV (Flash Video)
    b"Rar!\x1a\x07",  # RAR archive
    b"7z\xbc\xaf\x27\x1c",  # 7-Zip archive
    b"\x1a\x45\xdf\xa3",  # WebM/MKV (EBML)
    b"II\x2a\x00",  # TIFF (Intel byte order)
    b"MM\x00\x2a",  # TIFF (Motorola byte order)
    b"\x00\x00\x01\x00",  # ICO (Windows Icon)
    b"\x00\x00\x02\x00",  # CUR (Windows Cursor)
    b"wOFF",  # WOFF font
    b"wOF2",  # WOFF2 font
    b"\x00\x01\x00\x00",  # TrueType font
    b"OTTO",  # OpenType font with CFF
)

# Magic numbers that are just two or three capital letters. Ordinary prose can start with
# them ("BMW ...", "ID3 tags ..."), so they only count when the preview also contains a
# control byte that plain text would not.
_AMBIGUOUS_BINARY_SIGNATURES = (
    b"BM",  # BMP
    b"MZ",  # EXE/DLL (MZ header)
    b"ID3",  # MP3 with ID3 tag
    b"FWS",  # SWF (uncompressed Flash)
    b"CWS",  # SWF (zlib compressed Flash)
    b"ZWS",  # SWF (LZMA compressed Flash)
)

# Content-Types that are always text, whatever their top-level type is.
_TEXT_MIME_TYPES = frozenset(
    [
        "text/html",
        "text/plain",
        "text/css",
        "text/javascript",
        "text/xml",
        "text/csv",
        "text/markdown",
        "application/json",
        "application/javascript",
        "application/xml",
        "application/xhtml+xml",
        "application/rss+xml",
        "application/atom+xml",
    ]
)

_UTF8_BOM = b"\xef\xbb\xbf"


def _hasControlBytes(data):
    """Return True if data holds a NUL or another C0 control byte other than tab/LF/VT/FF/CR."""
    return any(byte < 0x09 or 0x0D < byte < 0x20 for byte in data)


def _sniffContent(contentBytes):
    """Inspect the first 100 bytes: return False for text, True for binary, None if undecided."""
    if not contentBytes:
        return None

    head = contentBytes[:100]
    # A UTF-8 byte order mark would otherwise hide every text signature behind it
    if head.startswith(_UTF8_BOM):
        head = head[len(_UTF8_BOM) :]
    preview = head.lstrip()

    # Text markers win over everything else, e.g. an archived HTML error page for a .zip URL
    if preview.lower().startswith(_TEXT_SIGNATURES):
        return False
    if preview.startswith(_BINARY_SIGNATURES):
        return True
    # MP4/M4A/MOV: the "ftyp" box type follows the 4 byte box size, so it sits at offset 4
    if preview[4:8] == b"ftyp":
        return True
    if preview.startswith(_AMBIGUOUS_BINARY_SIGNATURES) and _hasControlBytes(preview):
        return True
    return None


def _urlPathExtension(url):
    """Return the lower-cased extension (with leading dot) of the URL path, or "" if none."""
    # The value may carry a prefix (Wayback timestamp/URLScan UUID) before the real URL.
    # Cut at the EARLIEST scheme so a URL embedded in the query string is never picked up.
    schemePositions = [pos for pos in (url.find("http://"), url.find("https://")) if pos >= 0]
    actualUrl = url[min(schemePositions) :] if schemePositions else url

    # urlparse() already separates the query string, so only the path needs looking at
    lastSegment = urlparse(actualUrl.strip()).path.lower().rsplit("/", 1)[-1]
    if "." not in lastSegment:
        return ""
    return "." + lastSegment.rsplit(".", 1)[-1]


def isBinaryContent(contentBytes, contentType, url=""):
    """
    Determine if content should be treated as binary based on actual content, Content-Type, and URL.

    Priority (highest to lowest):
    1. Content inspection - text signatures first, then binary magic numbers (most reliable)
    2. Content-Type header
    3. URL extension (least reliable - archive might have captured an HTML error page)

    Args:
        contentBytes: The raw response bytes (only the first 100 bytes are inspected)
        contentType: The Content-Type header value (parameters after ";" are ignored)
        url: The URL (optional, used as fallback); may have a timestamp/UUID prefix

    Returns True if content is binary and should be saved as raw bytes. Anything that cannot
    be classified, including arguments of an unexpected type, is treated as text.
    """
    # STEP 1: Check the actual content (most reliable)
    try:
        sniffed = _sniffContent(contentBytes)
        if sniffed is not None:
            return sniffed
    except (AttributeError, TypeError):
        # contentBytes is not bytes-like; fall back to the header and URL
        pass

    # STEP 2: Check Content-Type header
    try:
        if contentType:
            mimeType = contentType.lower().split(";")[0].strip()

            # Explicit text types, including structured-syntax suffixes such as image/svg+xml
            if (
                mimeType in _TEXT_MIME_TYPES
                or mimeType.startswith("text/")
                or mimeType.endswith(("+xml", "+json"))
            ):
                return False

            # Known binary types
            if mimeType in BINARY_MIME_TYPES:
                return True

            # Generic binary prefixes
            if mimeType.startswith(("image/", "audio/", "video/")):
                return True

            # application/* is often binary, but not always - be conservative
            if mimeType.startswith("application/") and (
                "octet-stream" in mimeType or "binary" in mimeType
            ):
                return True
    except (AttributeError, TypeError):
        pass

    # STEP 3: Check URL extension as last resort
    try:
        if url and _urlPathExtension(url) in BINARY_EXTENSIONS:
            return True
    except (AttributeError, TypeError, ValueError):
        # Not a string, or urlparse() rejected it as malformed
        pass

    # Default: treat as text (safer - text processing won't corrupt text)
    return False
591
+
300
592
 
301
593
  # Get memory usage for
302
594
  def getMemory():
@@ -955,16 +1247,12 @@ def showOptions():
955
1247
  )
956
1248
  )
957
1249
 
1250
+ # Only show --source-ip if it's explicitly configured
958
1251
  if SOURCE_IP:
959
1252
  write(
960
1253
  colored("--source-ip: " + str(SOURCE_IP), "magenta")
961
1254
  + colored(" Outbound requests will bind to this IP.", "white")
962
1255
  )
963
- else:
964
- write(
965
- colored("--source-ip: default", "magenta")
966
- + colored(" Outbound IP determined by OS routing table.", "white")
967
- )
968
1256
 
969
1257
  write()
970
1258
 
@@ -1497,6 +1785,63 @@ def fixArchiveOrgUrl(url):
1497
1785
  return url
1498
1786
 
1499
1787
 
1788
def isLikelyBinaryUrl(url):
    """
    Check if a URL likely points to a binary file based on its extension.
    This is used BEFORE making a request to decide if we need the raw/id_ version.

    Args:
        url: The URL to check; it may be prefixed (Wayback timestamp/URLScan UUID)

    Returns True if the extension of the URL path is in BINARY_EXTENSIONS. Returns False
    otherwise, including when url is not a string or cannot be parsed.
    """
    try:
        # Extract actual URL from prefixed formats (Wayback timestamp/URLScan UUID).
        # Cut at the EARLIEST scheme: preferring "https://" unconditionally would jump to a
        # URL embedded in the query string of an "http://" link and test the wrong extension.
        schemePositions = [
            pos for pos in (url.find("http://"), url.find("https://")) if pos >= 0
        ]
        actualUrl = url[min(schemePositions) :] if schemePositions else url

        # urlparse() already separates the query string, so only the path needs looking at.
        # Only the last path segment can carry the file extension.
        lastSegment = urlparse(actualUrl.strip()).path.lower().rsplit("/", 1)[-1]
        if "." not in lastSegment:
            return False
        return ("." + lastSegment.rsplit(".", 1)[-1]) in BINARY_EXTENSIONS
    except (AttributeError, TypeError, ValueError):
        # Not a string, or urlparse() rejected it as malformed - assume it is not binary
        return False
1814
+
1815
+
1816
def addRawModifier(archiveUrl):
    """
    Add 'id_' modifier to Wayback Machine URL to get raw/original content.
    This is essential for binary files to avoid Wayback modifications.

    The modifier is only inserted when the path segment after "/web/" is a bare timestamp
    (1 to 14 digits). A URL that already carries a modifier (id_, if_, ...), has no
    timestamp, or is not a string is returned unchanged.

    Example:
        Input:  https://web.archive.org/web/20090315210455/http://example.com/file.wmv
        Output: https://web.archive.org/web/20090315210455id_/http://example.com/file.wmv
    """
    try:
        webPos = archiveUrl.find("/web/")
        if webPos < 0:
            return archiveUrl

        # The timestamp runs from just after "/web/" to the next slash
        afterWeb = webPos + len("/web/")
        slashAfterTimestamp = archiveUrl.find("/", afterWeb)
        if slashAfterTimestamp <= afterWeb:
            return archiveUrl

        # Anything other than plain digits is either an existing modifier or not a timestamp
        # at all (e.g. "/web/http://..."); inserting id_ there would produce a broken URL
        timestamp = archiveUrl[afterWeb:slashAfterTimestamp]
        if not re.fullmatch(r"[0-9]{1,14}", timestamp):
            return archiveUrl

        return archiveUrl[:slashAfterTimestamp] + "id_" + archiveUrl[slashAfterTimestamp:]
    except (AttributeError, TypeError):
        # Not a string - hand it back untouched
        return archiveUrl
1843
+
1844
+
1500
1845
  # Add a link to the linksFound collection for archived responses (includes timestamp prefix)
1501
1846
  def linksFoundResponseAdd(link):
1502
1847
  global linksFound, argsInput, argsInputHostname, links_lock
@@ -1581,6 +1926,12 @@ def processArchiveUrl(url):
1581
1926
  if stopProgram is None:
1582
1927
 
1583
1928
  archiveUrl = "https://web.archive.org/web/" + fixArchiveOrgUrl(url)
1929
+
1930
+ # For binary files, add id_ modifier to get raw/original content
1931
+ # This prevents Wayback Machine from modifying the content
1932
+ if isLikelyBinaryUrl(url):
1933
+ archiveUrl = addRawModifier(archiveUrl)
1934
+
1584
1935
  hashValue = ""
1585
1936
 
1586
1937
  # Get memory usage every 100 responses
@@ -1593,6 +1944,18 @@ def processArchiveUrl(url):
1593
1944
  # Make a request to the web archive
1594
1945
  try:
1595
1946
  try:
1947
+ try:
1948
+ if os.environ.get("USER") == "xnl":
1949
+ writerr(
1950
+ colored(
1951
+ "[ DBG ] Requesting file " + archiveUrl,
1952
+ "yellow",
1953
+ attrs=["dark"],
1954
+ )
1955
+ )
1956
+ except Exception:
1957
+ pass
1958
+
1596
1959
  # Choose a random user agent string to use for any requests
1597
1960
  userAgent = random.choice(USER_AGENT)
1598
1961
 
@@ -1604,146 +1967,175 @@ def processArchiveUrl(url):
1604
1967
  headers={"User-Agent": userAgent},
1605
1968
  allow_redirects=True,
1606
1969
  )
1607
- archiveHtml = str(resp.text)
1970
+
1971
+ # Get raw content bytes first
1972
+ contentBytes = resp.content
1973
+
1608
1974
  try:
1609
- contentType = resp.headers.get("Content-Type").split(";")[0].lower()
1975
+ contentType = resp.headers.get("Content-Type", "").split(";")[0].lower()
1610
1976
  except Exception:
1611
1977
  contentType = ""
1612
1978
 
1979
+ # Determine if this is binary content based on actual content, Content-Type, and URL
1980
+ isBinary = isBinaryContent(contentBytes, contentType, url)
1981
+
1982
+ if isBinary:
1983
+ # For binary files, use raw bytes as-is
1984
+ archiveContent = contentBytes
1985
+ archiveHtml = None # Not used for binary files
1986
+ else:
1987
+ # For text files, decode to string
1988
+ archiveHtml = contentBytes.decode("utf-8", errors="replace")
1989
+ archiveContent = None # Not used for text files
1990
+
1613
1991
  # Only create a file if there is a response
1614
- if len(archiveHtml) != 0:
1992
+ responseLength = len(archiveContent) if isBinary else len(archiveHtml)
1993
+ if responseLength != 0:
1615
1994
 
1995
+ # For text files, check for custom 404 pages
1616
1996
  # If the FILTER_CODE doesn't include 404, OR
1617
1997
  # If the FILTER_CODE includes 404, and it doesn't seem to be a custom 404 page
1618
- if "404" not in FILTER_CODE or (
1619
- "404" in FILTER_CODE
1620
- and not re.findall(REGEX_404, archiveHtml, re.DOTALL | re.IGNORECASE)
1621
- ):
1622
-
1623
- # Add the URL as a comment at the start of the response
1624
- if args.url_filename:
1625
- archiveHtml = (
1626
- "/* Original URL: " + archiveUrl + " */\n" + archiveHtml
1998
+ if (
1999
+ isBinary
2000
+ or "404" not in FILTER_CODE
2001
+ or (
2002
+ "404" in FILTER_CODE
2003
+ and not re.findall(
2004
+ REGEX_404, archiveHtml, re.DOTALL | re.IGNORECASE
1627
2005
  )
1628
-
1629
- # Remove all web archive references in the response
1630
- archiveHtml = re.sub(
1631
- r'\<script type=\"text\/javascript" src=\"\/_static\/js\/bundle-playback\.js\?v=[A-Za-z0-9]*" charset="utf-8"><\/script>\n<script type="text\/javascript" src="\/_static\/js\/wombat\.js.*\<\!-- End Wayback Rewrite JS Include --\>',
1632
- "",
1633
- archiveHtml,
1634
- 1,
1635
- flags=re.DOTALL | re.IGNORECASE,
1636
- )
1637
- archiveHtml = re.sub(
1638
- r"\<script src=\"\/\/archive\.org.*\<\!-- End Wayback Rewrite JS Include --\>",
1639
- "",
1640
- archiveHtml,
1641
- 1,
1642
- flags=re.DOTALL | re.IGNORECASE,
1643
- )
1644
- archiveHtml = re.sub(
1645
- r"\<script\>window\.RufflePlayer[^\<]*\<\/script\>",
1646
- "",
1647
- archiveHtml,
1648
- 1,
1649
- flags=re.DOTALL | re.IGNORECASE,
1650
- )
1651
- archiveHtml = re.sub(
1652
- r"\<\!-- BEGIN WAYBACK TOOLBAR INSERT --\>.*\<\!-- END WAYBACK TOOLBAR INSERT --\>",
1653
- "",
1654
- archiveHtml,
1655
- 1,
1656
- flags=re.DOTALL | re.IGNORECASE,
1657
- )
1658
- archiveHtml = re.sub(
1659
- r"(}\n)?(\/\*|<!--\n)\s*FILE ARCHIVED ON.*108\(a\)\(3\)\)\.\n(\*\/|-->)",
1660
- "",
1661
- archiveHtml,
1662
- 1,
1663
- flags=re.DOTALL | re.IGNORECASE,
1664
- )
1665
- archiveHtml = re.sub(
1666
- r"var\s_____WB\$wombat\$assign\$function.*WB\$wombat\$assign\$function_____\(\"opener\"\);",
1667
- "",
1668
- archiveHtml,
1669
- 1,
1670
- flags=re.DOTALL | re.IGNORECASE,
1671
- )
1672
- archiveHtml = re.sub(
1673
- r"(\<\!--|\/\*)\nplayback timings.*(--\>|\*\/)",
1674
- "",
1675
- archiveHtml,
1676
- 1,
1677
- flags=re.DOTALL | re.IGNORECASE,
1678
- )
1679
- archiveHtml = re.sub(
1680
- r"((https:)?\/\/web\.archive\.org)?\/web\/[0-9]{14}([A-Za-z]{2}\_)?\/",
1681
- "",
1682
- archiveHtml,
1683
- flags=re.IGNORECASE,
1684
- )
1685
- archiveHtml = re.sub(
1686
- r"((https:)?\\\/\\\/web\.archive\.org)?\\\/web\\\/[0-9]{14}([A-Za-z]{2}\_)?\\\/",
1687
- "",
1688
- archiveHtml,
1689
- flags=re.IGNORECASE,
1690
- )
1691
- archiveHtml = re.sub(
1692
- r"((https:)?%2F%2Fweb\.archive\.org)?%2Fweb%2F[0-9]{14}([A-Za-z]{2}\_)?%2F",
1693
- "",
1694
- archiveHtml,
1695
- flags=re.IGNORECASE,
1696
- )
1697
- archiveHtml = re.sub(
1698
- r"((https:)?\\u002F\\u002Fweb\.archive\.org)?\\u002Fweb\\u002F[0-9]{14}([A-Za-z]{2}\_)?\\u002F",
1699
- "",
1700
- archiveHtml,
1701
- flags=re.IGNORECASE,
1702
- )
1703
- archiveHtml = re.sub(
1704
- r"\<script type=\"text\/javascript\">\s*__wm\.init\(\"https:\/\/web\.archive\.org\/web\"\);[^\<]*\<\/script\>",
1705
- "",
1706
- archiveHtml,
1707
- flags=re.IGNORECASE,
1708
- )
1709
- archiveHtml = re.sub(
1710
- r'\<script type=\"text\/javascript\" src="https:\/\/web-static\.archive\.org[^\<]*\<\/script\>',
1711
- "",
1712
- archiveHtml,
1713
- flags=re.IGNORECASE,
1714
- )
1715
- archiveHtml = re.sub(
1716
- r"\<link rel=\"stylesheet\" type=\"text\/css\" href=\"https:\/\/web-static\.archive\.org[^\<]*\/\>",
1717
- "",
1718
- archiveHtml,
1719
- flags=re.IGNORECASE,
1720
- )
1721
- archiveHtml = re.sub(
1722
- r"\<\!-- End Wayback Rewrite JS Include --\>",
1723
- "",
1724
- archiveHtml,
1725
- re.IGNORECASE,
1726
2006
  )
2007
+ ):
1727
2008
 
1728
- # If there is a specific Wayback error in the response, raise an exception
1729
- if (
1730
- archiveHtml.lower().find(
1731
- "wayback machine has not archived that url"
2009
+ # For text files only: Add URL comment and clean up wayback references
2010
+ if not isBinary:
2011
+ # Add the URL as a comment at the start of the response
2012
+ if args.url_filename:
2013
+ archiveHtml = (
2014
+ "/* Original URL: " + archiveUrl + " */\n" + archiveHtml
2015
+ )
2016
+
2017
+ # Remove all web archive references in the response
2018
+ archiveHtml = re.sub(
2019
+ r'\<script type=\"text\/javascript" src=\"\/_static\/js\/bundle-playback\.js\?v=[A-Za-z0-9]*" charset="utf-8"\><\/script>\n<script type="text\/javascript" src="\/_static\/js\/wombat\.js.*\<\!-- End Wayback Rewrite JS Include --\>',
2020
+ "",
2021
+ archiveHtml,
2022
+ 1,
2023
+ flags=re.DOTALL | re.IGNORECASE,
1732
2024
  )
1733
- > 0
1734
- or archiveHtml.lower().find(
1735
- "snapshot cannot be displayed due to an internal error"
2025
+ archiveHtml = re.sub(
2026
+ r"\<script src=\"\/\/archive\.org.*\<\!-- End Wayback Rewrite JS Include --\>",
2027
+ "",
2028
+ archiveHtml,
2029
+ 1,
2030
+ flags=re.DOTALL | re.IGNORECASE,
2031
+ )
2032
+ archiveHtml = re.sub(
2033
+ r"\<script\>window\.RufflePlayer[^\<]*\<\/script\>",
2034
+ "",
2035
+ archiveHtml,
2036
+ 1,
2037
+ flags=re.DOTALL | re.IGNORECASE,
2038
+ )
2039
+ archiveHtml = re.sub(
2040
+ r"\<\!-- BEGIN WAYBACK TOOLBAR INSERT --\>.*\<\!-- END WAYBACK TOOLBAR INSERT --\>",
2041
+ "",
2042
+ archiveHtml,
2043
+ 1,
2044
+ flags=re.DOTALL | re.IGNORECASE,
2045
+ )
2046
+ archiveHtml = re.sub(
2047
+ r"(}\n)?(\/\*|<\!--\n)\s*FILE ARCHIVED ON.*108\(a\)\(3\)\)\.\n(\*\/|--\>)",
2048
+ "",
2049
+ archiveHtml,
2050
+ 1,
2051
+ flags=re.DOTALL | re.IGNORECASE,
2052
+ )
2053
+ archiveHtml = re.sub(
2054
+ r"var\s_____WB\$wombat\$assign\$function.*WB\$wombat\$assign\$function_____\(\"opener\"\);",
2055
+ "",
2056
+ archiveHtml,
2057
+ 1,
2058
+ flags=re.DOTALL | re.IGNORECASE,
2059
+ )
2060
+ archiveHtml = re.sub(
2061
+ r"(\<\!--|\/\*)\nplayback timings.*(--\>|\*\/)",
2062
+ "",
2063
+ archiveHtml,
2064
+ 1,
2065
+ flags=re.DOTALL | re.IGNORECASE,
2066
+ )
2067
+ archiveHtml = re.sub(
2068
+ r"((https:)?\/\/web\.archive\.org)?\/web\/[0-9]{14}([A-Za-z]{2}\_)?\/",
2069
+ "",
2070
+ archiveHtml,
2071
+ flags=re.IGNORECASE,
2072
+ )
2073
+ archiveHtml = re.sub(
2074
+ r"((https:)?\\\/\\\/web\.archive\.org)?\\\/web\\\/[0-9]{14}([A-Za-z]{2}\_)?\\\/",
2075
+ "",
2076
+ archiveHtml,
2077
+ flags=re.IGNORECASE,
2078
+ )
2079
+ archiveHtml = re.sub(
2080
+ r"((https:)?%2F%2Fweb\.archive\.org)?%2Fweb%2F[0-9]{14}([A-Za-z]{2}\_)?%2F",
2081
+ "",
2082
+ archiveHtml,
2083
+ flags=re.IGNORECASE,
2084
+ )
2085
+ archiveHtml = re.sub(
2086
+ r"((https:)?\\u002F\\u002Fweb\.archive\.org)?\\u002Fweb\\u002F[0-9]{14}([A-Za-z]{2}\_)?\\u002F",
2087
+ "",
2088
+ archiveHtml,
2089
+ flags=re.IGNORECASE,
2090
+ )
2091
+ archiveHtml = re.sub(
2092
+ r"\<script type=\"text\/javascript\"\>\s*__wm\.init\(\"https:\/\/web\.archive\.org\/web\"\);[^\<]*\<\/script\>",
2093
+ "",
2094
+ archiveHtml,
2095
+ flags=re.IGNORECASE,
2096
+ )
2097
+ archiveHtml = re.sub(
2098
+ r'\<script type=\"text\/javascript\" src="https:\/\/web-static\.archive\.org[^\<]*\<\/script\>',
2099
+ "",
2100
+ archiveHtml,
2101
+ flags=re.IGNORECASE,
2102
+ )
2103
+ archiveHtml = re.sub(
2104
+ r"\<link rel=\"stylesheet\" type=\"text\/css\" href=\"https:\/\/web-static\.archive\.org[^\<]*\/\>",
2105
+ "",
2106
+ archiveHtml,
2107
+ flags=re.IGNORECASE,
2108
+ )
2109
+ archiveHtml = re.sub(
2110
+ r"\<\!-- End Wayback Rewrite JS Include --\>",
2111
+ "",
2112
+ archiveHtml,
2113
+ re.IGNORECASE,
1736
2114
  )
1737
- > 0
1738
- ):
1739
- raise WayBackException
2115
+
2116
+ # If there is a specific Wayback error in the response, raise an exception
2117
+ if (
2118
+ archiveHtml.lower().find(
2119
+ "wayback machine has not archived that url"
2120
+ )
2121
+ > 0
2122
+ or archiveHtml.lower().find(
2123
+ "snapshot cannot be displayed due to an internal error"
2124
+ )
2125
+ > 0
2126
+ ):
2127
+ raise WayBackException
1740
2128
 
1741
2129
  # Create file name based on url or hash value of the response, depending on selection. Ensure the file name isn't over 255 characters
1742
2130
  if args.url_filename:
1743
2131
  fileName = url.replace("/", "-").replace(":", "")
1744
2132
  fileName = fileName[0:254]
1745
2133
  else:
1746
- hashValue = filehash(archiveHtml)
2134
+ # For binary files, hash the raw bytes; for text, hash the text
2135
+ if isBinary:
2136
+ hashValue = filehash(archiveContent.hex())
2137
+ else:
2138
+ hashValue = filehash(archiveHtml)
1747
2139
  fileName = hashValue
1748
2140
 
1749
2141
  # Determine extension of file from the content-type using the mimetypes library
@@ -1785,11 +2177,15 @@ def processArchiveUrl(url):
1785
2177
  extension = "css"
1786
2178
  elif "pdf" in extension:
1787
2179
  extension = "pdf"
2180
+ elif "zip" in extension:
2181
+ extension = "zip"
2182
+ elif "gzip" in extension or "x-gzip" in extension:
2183
+ extension = "gz"
1788
2184
  elif "plain" == extension:
1789
2185
  extension = "txt"
1790
2186
 
1791
2187
  # If extension is still blank, set to html if the content ends with HTML tag, otherwise set to unknown
1792
- if extension == "":
2188
+ if extension == "" and not isBinary:
1793
2189
  if (
1794
2190
  archiveHtml.lower().strip().endswith("</html>")
1795
2191
  or archiveHtml.lower()
@@ -1800,6 +2196,8 @@ def processArchiveUrl(url):
1800
2196
  extension = "html"
1801
2197
  else:
1802
2198
  extension = "unknown"
2199
+ elif extension == "" and isBinary:
2200
+ extension = "bin"
1803
2201
 
1804
2202
  fileName = fileName + "." + extension
1805
2203
 
@@ -1816,10 +2214,14 @@ def processArchiveUrl(url):
1816
2214
  + f"{fileName}"
1817
2215
  )
1818
2216
 
1819
- # Write the file
2217
+ # Write the file - binary mode for binary files, text mode for text files
1820
2218
  try:
1821
- responseFile = open(filePath, "w", encoding="utf8")
1822
- responseFile.write(archiveHtml)
2219
+ if isBinary:
2220
+ responseFile = open(filePath, "wb")
2221
+ responseFile.write(archiveContent)
2222
+ else:
2223
+ responseFile = open(filePath, "w", encoding="utf8")
2224
+ responseFile.write(archiveHtml)
1823
2225
  responseFile.close()
1824
2226
  fileCount = fileCount + 1
1825
2227
  except Exception as e:
@@ -1852,9 +2254,10 @@ def processArchiveUrl(url):
1852
2254
  )
1853
2255
  )
1854
2256
 
1855
- # FOR DEBUGGING PURPOSES
2257
+ # FOR DEBUGGING PURPOSES (only for text files)
1856
2258
  try:
1857
- if os.environ.get("USER") == "xnl":
2259
+ if os.environ.get("USER") == "xnl" and not isBinary:
2260
+
1858
2261
  debugText = ""
1859
2262
  if archiveHtml.lower().find("archive.org") > 0:
1860
2263
  debugText = "ARCHIVE.ORG"
@@ -1866,16 +2269,27 @@ def processArchiveUrl(url):
1866
2269
  writerr(
1867
2270
  colored(
1868
2271
  getSPACER(
1869
- '"'
2272
+ '[ DBG ] "'
1870
2273
  + fileName
1871
2274
  + '" CONTAINS '
1872
2275
  + debugText
1873
2276
  + " - CHECK ITS A VALID REFERENCE"
1874
2277
  ),
1875
2278
  "yellow",
2279
+ attrs=["dark"],
1876
2280
  )
1877
2281
  )
1878
- except Exception:
2282
+ except Exception as e:
2283
+ writerr(
2284
+ colored(
2285
+ '[ DBG ] Error - Failed to output debug info for "'
2286
+ + archiveUrl
2287
+ + '": '
2288
+ + str(e),
2289
+ "red",
2290
+ attrs=["dark"],
2291
+ )
2292
+ )
1879
2293
  pass
1880
2294
 
1881
2295
  successCount = successCount + 1
@@ -2897,17 +3311,38 @@ def getURLScanDOM(originalUrl, domUrl):
2897
3311
  resp = session.get(
2898
3312
  domUrl, headers={"User-Agent": userAgent}, allow_redirects=True
2899
3313
  )
2900
- archiveHtml = str(resp.text)
3314
+
3315
+ # Get raw content bytes first
3316
+ contentBytes = resp.content
3317
+
3318
+ # Get content type from response headers
3319
+ try:
3320
+ contentType = resp.headers.get("Content-Type", "").split(";")[0].lower()
3321
+ except Exception:
3322
+ contentType = ""
3323
+
3324
+ # Determine if this is binary content based on actual content, Content-Type, and URL
3325
+ isBinary = isBinaryContent(contentBytes, contentType, originalUrl)
3326
+
3327
+ if isBinary:
3328
+ # For binary files, use raw bytes as-is
3329
+ archiveContent = contentBytes
3330
+ archiveHtml = None
3331
+ else:
3332
+ # For text files, decode to string
3333
+ archiveHtml = contentBytes.decode("utf-8", errors="replace")
3334
+ archiveContent = None
2901
3335
 
2902
3336
  # If there is a specific URLScan error in the response, raise an exception
2903
- if archiveHtml.lower().strip() == "not found!":
3337
+ if not isBinary and archiveHtml.lower().strip() == "not found!":
2904
3338
  raise WayBackException
2905
3339
 
2906
3340
  # Only create a file if there is a response
2907
- if len(archiveHtml) != 0:
3341
+ responseLength = len(archiveContent) if isBinary else len(archiveHtml)
3342
+ if responseLength != 0:
2908
3343
 
2909
- # Add the URL as a comment at the start of the response
2910
- if args.url_filename:
3344
+ # Add the URL as a comment at the start of the response (text files only)
3345
+ if not isBinary and args.url_filename:
2911
3346
  archiveHtml = "/* Original URL: " + originalUrl + " */\n" + archiveHtml
2912
3347
 
2913
3348
  # Create file name based on url or hash value of the response, depending on selection. Ensure the file name isn't over 255 characters
@@ -2915,7 +3350,11 @@ def getURLScanDOM(originalUrl, domUrl):
2915
3350
  fileName = originalUrl.replace("/", "-").replace(":", "")
2916
3351
  fileName = fileName[0:254]
2917
3352
  else:
2918
- hashValue = filehash(archiveHtml)
3353
+ # For binary files, hash the raw bytes; for text, hash the text
3354
+ if isBinary:
3355
+ hashValue = filehash(archiveContent.hex())
3356
+ else:
3357
+ hashValue = filehash(archiveHtml)
2919
3358
  fileName = hashValue
2920
3359
 
2921
3360
  # Determine extension of file from the content-type using the mimetypes library
@@ -2933,7 +3372,7 @@ def getURLScanDOM(originalUrl, domUrl):
2933
3372
  pass
2934
3373
 
2935
3374
  # If the extension is blank, numeric, longer than 4 characters or not alphanumeric - then set to html if the content ends with HTML tag, otherwise set to unknown
2936
- if extension == "":
3375
+ if extension == "" and not isBinary:
2937
3376
  if (
2938
3377
  archiveHtml.lower().strip().endswith("</html>")
2939
3378
  or archiveHtml.lower().strip().endswith("</body>")
@@ -2944,6 +3383,8 @@ def getURLScanDOM(originalUrl, domUrl):
2944
3383
  extension = "html"
2945
3384
  else:
2946
3385
  extension = "unknown"
3386
+ elif extension == "" and isBinary:
3387
+ extension = "bin"
2947
3388
 
2948
3389
  fileName = fileName + "." + extension
2949
3390
 
@@ -2960,10 +3401,14 @@ def getURLScanDOM(originalUrl, domUrl):
2960
3401
  + f"{fileName}"
2961
3402
  )
2962
3403
 
2963
- # Write the file
3404
+ # Write the file - binary mode for binary files, text mode for text files
2964
3405
  try:
2965
- responseFile = open(filePath, "w", encoding="utf8")
2966
- responseFile.write(archiveHtml)
3406
+ if isBinary:
3407
+ responseFile = open(filePath, "wb")
3408
+ responseFile.write(archiveContent)
3409
+ else:
3410
+ responseFile = open(filePath, "w", encoding="utf8")
3411
+ responseFile.write(archiveHtml)
2967
3412
  responseFile.close()
2968
3413
  fileCount = fileCount + 1
2969
3414
  except Exception as e:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: waymore
3
- Version: 7.6
3
+ Version: 7.7
4
4
  Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan, VirusTotal & Intelligence X!
5
5
  Home-page: https://github.com/xnl-h4ck3r/waymore
6
6
  Author: xnl-h4ck3r
@@ -21,7 +21,7 @@ Dynamic: license-file
21
21
 
22
22
  <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
23
23
 
24
- ## About - v7.6
24
+ ## About - v7.7
25
25
 
26
26
  The idea behind **waymore** is to find even more links from the Wayback Machine (plus other sources) than other existing tools.
27
27
 
@@ -0,0 +1,8 @@
1
+ waymore/__init__.py,sha256=FhVZ4Gv-sfTtDVYJPqfcEiZlqIYicpAUZojeZ5s9NfE,21
2
+ waymore/waymore.py,sha256=nMWRNwva3fWUiuX_UsbHbk_xd0-4Sp8W6i3oJqXciJw,309139
3
+ waymore-7.7.dist-info/licenses/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
4
+ waymore-7.7.dist-info/METADATA,sha256=40oJ2jrPkmpUxAs7FslXDaPpZ7fOd-Knj_GtCcSnIx0,53457
5
+ waymore-7.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
+ waymore-7.7.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
7
+ waymore-7.7.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
8
+ waymore-7.7.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- waymore/__init__.py,sha256=aH35DnqFAFh0wObYdVN2uNFbgLs1105jFtAsq-lrUFE,21
2
- waymore/waymore.py,sha256=6MrZIoVV7qn7V5kNMA5hiCqPcHEaxbLOfqdlOUrVPT0,291719
3
- waymore-7.6.dist-info/licenses/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
4
- waymore-7.6.dist-info/METADATA,sha256=QykFQ6yDdHFXhSTRlOp7pb49Zvl7-e5xLAN8FbcrUDo,53457
5
- waymore-7.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
- waymore-7.6.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
7
- waymore-7.6.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
8
- waymore-7.6.dist-info/RECORD,,
File without changes