jseye 1.0.1__tar.gz → 1.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {jseye-1.0.1/jseye.egg-info → jseye-1.0.2}/PKG-INFO +1 -1
- {jseye-1.0.1 → jseye-1.0.2}/jseye/__init__.py +1 -1
- jseye-1.0.2/jseye/__main__.py +9 -0
- jseye-1.0.2/jseye/banner.py +84 -0
- jseye-1.0.2/jseye/cli.py +182 -0
- jseye-1.0.2/jseye/modules/harvest.py +177 -0
- jseye-1.0.2/jseye/modules/js_download.py +278 -0
- jseye-1.0.2/jseye/modules/js_filter.py +241 -0
- jseye-1.0.2/jseye/modules/tiered_analysis.py +304 -0
- jseye-1.0.2/jseye/pipeline.py +348 -0
- jseye-1.0.2/jseye/utils/cache.py +241 -0
- {jseye-1.0.1 → jseye-1.0.2/jseye.egg-info}/PKG-INFO +1 -1
- {jseye-1.0.1 → jseye-1.0.2}/jseye.egg-info/SOURCES.txt +3 -0
- {jseye-1.0.1 → jseye-1.0.2}/pyproject.toml +1 -1
- jseye-1.0.1/jseye/banner.py +0 -37
- jseye-1.0.1/jseye/cli.py +0 -137
- jseye-1.0.1/jseye/modules/harvest.py +0 -124
- jseye-1.0.1/jseye/modules/js_download.py +0 -82
- jseye-1.0.1/jseye/modules/js_filter.py +0 -186
- jseye-1.0.1/jseye/pipeline.py +0 -249
- {jseye-1.0.1 → jseye-1.0.2}/LICENSE +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/MANIFEST.in +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/README.md +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye/data/regex.yaml +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye/data/vendor_blacklist.txt +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye/installer.py +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye/modules/__init__.py +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye/modules/analyze_ast.py +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye/modules/analyze_regex.py +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye/modules/correlate.py +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye/modules/linkfinder.py +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye/modules/secrets.py +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye/modules/sinks.py +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye/utils/__init__.py +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye/utils/fs.py +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye/utils/hashing.py +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye/utils/logger.py +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye/utils/shell.py +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye.egg-info/dependency_links.txt +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye.egg-info/entry_points.txt +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye.egg-info/requires.txt +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/jseye.egg-info/top_level.txt +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/scripts/ast_parser.js +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/setup.cfg +0 -0
- {jseye-1.0.1 → jseye-1.0.2}/setup.py +0 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""
|
|
2
|
+
JSEye Banner Display with Version Checking
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.text import Text
|
|
8
|
+
from rich.align import Align
|
|
9
|
+
from rich.panel import Panel
|
|
10
|
+
from . import __version__
|
|
11
|
+
|
|
12
|
+
# Module-level Rich console; show_banner() and show_performance_banner()
# print through it.
console = Console()
|
|
13
|
+
|
|
14
|
+
def get_latest_pypi_version():
    """Return the newest jseye version string published on PyPI.

    Queries the PyPI JSON endpoint with a 3-second timeout. The lookup is
    best-effort: a network error, a non-200 response, or a payload without
    ``info.version`` all result in ``None`` rather than an exception.

    Returns:
        str | None: the ``info.version`` field of the response, or ``None``
        when it could not be determined.
    """
    try:
        response = requests.get("https://pypi.org/pypi/jseye/json", timeout=3)
        if response.status_code == 200:
            return response.json()["info"]["version"]
    except Exception:
        # Deliberately broad so a failed version check can never break the
        # banner, but no longer a bare ``except:`` -- KeyboardInterrupt and
        # SystemExit now propagate instead of being silently swallowed.
        pass
    return None
|
|
24
|
+
|
|
25
|
+
def show_banner():
    """Print the JSEye start-up banner through the module console.

    Output order: blank line, ASCII-art logo, blank line, tagline, version
    line (extended with the PyPI version when it differs from the installed
    one), author credit, blank line. Every text element is centred.
    """
    # NOTE(review): the logo is reproduced exactly as it appears in the
    # reviewed text; confirm its spacing against the released banner.py.
    logo = """ ▄▄▄▄▄▄ ▄▄▄▄▄ ▄▄▄▄▄▄▄
█▀ ██ ██▀▀▀▀█▄ █▀██▀▀▀
██ ▀██▄ ▄▀ ██
██ ▀██▄▄ ████ ██ ██ ▄█▀█▄
██ ▄ ▀██▄ ██ ██▄██ ██▄█▀
██ ▀██████▀ ▀█████▄▄▀██▀ ▀██▄▄▄
▄ ██ ██
▀████▀ ▀▀▀ """

    def emit(content, style):
        # Wrap the string in a styled Text and print it centred.
        console.print(Align.center(Text(content, style=style)))

    console.print()  # top padding
    emit(logo, "cyan bold")
    console.print()

    emit("JavaScript Intelligence & Attack Surface Discovery", "green bold")

    # Version line: highlight in yellow when PyPI reports a different version.
    version_line = f"Current Version: v{__version__}"
    newest = get_latest_pypi_version()
    differs = bool(newest) and newest != __version__
    if differs:
        version_line += f" | Latest: v{newest} (Update available!)"
    emit(version_line, "yellow bold" if differs else "green")

    emit("Author: Lakshmikanthan K (letchupkt)", "purple")

    console.print()
|
|
69
|
+
|
|
70
|
+
def show_performance_banner():
    """Print the centred "Performance Upgrades" panel for v1.0.2."""
    # One markup string per panel row; joined with newlines below.
    rows = [
        "[bold cyan]>> JSEye v1.0.2 - Performance Revolution[/bold cyan]",
        "[green][+] Parallel Tool Execution (3-5x faster)[/green]",
        "[green][+] Smart JS Prioritization (60-70% time saved)[/green]",
        "[green][+] Tiered Analysis Engine[/green]",
        "[green][+] Comprehensive Caching System[/green]",
        "[green][+] Full CPU Utilization[/green]",
    ]
    upgrades_panel = Panel.fit(
        "\n".join(rows),
        style="cyan",
        title="[bold white]Performance Upgrades[/bold white]",
    )
    console.print(Align.center(upgrades_panel))
    console.print()
|
jseye-1.0.2/jseye/cli.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
JSEye CLI - Enhanced Main Entry Point
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
import argparse
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich.panel import Panel
|
|
12
|
+
|
|
13
|
+
from .banner import show_banner, show_performance_banner
|
|
14
|
+
from .installer import check_and_install_tools
|
|
15
|
+
from .pipeline import JSEyePipeline
|
|
16
|
+
|
|
17
|
+
# Module-level Rich console; list_modules() and main() print through it.
console = Console()
|
|
18
|
+
|
|
19
|
+
def create_parser():
    """Build the argparse parser for the ``jseye`` command line.

    ``-i/--input`` and ``-o/--output`` take a value and are not marked as
    required at the argparse level; every other option is a boolean switch
    that defaults to ``False``.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    # NOTE(review): example text reproduced exactly as it appears in the
    # reviewed text; confirm its column alignment against the released cli.py.
    usage_examples = """
Examples:
jseye -i subs.txt -o output # Full parallel pipeline
jseye -i subs.txt -o output --js-only # Stop after JS discovery
jseye -i subs.txt -o output --no-secrets # Skip secrets detection
jseye -i subs.txt -o output --regex-only # Only regex analysis
jseye -i subs.txt -o output --performance # Show performance banner
"""

    cli = argparse.ArgumentParser(
        description="JSEye v1.0.2 - High-Performance JavaScript Intelligence Framework",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Value-taking options.
    valued_options = (
        ("-i", "--input", "Input file containing subdomains"),
        ("-o", "--output", "Output directory for results"),
    )
    for short_flag, long_flag, help_text in valued_options:
        cli.add_argument(short_flag, long_flag, help=help_text)

    # Boolean switches, registered in the order they appear in --help:
    # performance/UI options first, then module control flags.
    switches = (
        ("--performance", "Show performance upgrades banner"),
        ("--no-banner", "Skip banner display"),
        ("--js-only", "Stop after JavaScript discovery"),
        ("--no-install", "Do not auto-install missing tools"),
        ("--skip-ast", "Skip AST analysis"),
        ("--regex-only", "Only perform regex analysis"),
        ("--no-secrets", "Skip secrets detection"),
        ("--no-sinks", "Skip sink detection"),
        ("--no-correlate", "Skip correlation engine"),
        ("--list-modules", "Show available modules and exit"),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action="store_true", help=help_text)

    return cli
|
|
65
|
+
|
|
66
|
+
def list_modules():
    """Print a panel listing each JSEye module with its tag and description."""
    # (tag, module name, description) for every row of the panel.
    catalogue = (
        ("[*]", "harvest", "Parallel URL harvesting (gau, waybackurls, katana, hakrawler, subjs)"),
        ("[>]", "js_filter", "Intelligent JavaScript filtering with prioritization"),
        ("[>>]", "js_download", "Parallel JavaScript file downloading with caching"),
        ("[#]", "tiered_analysis", "Smart tiered analysis engine (T1/T2/T3)"),
        ("[~]", "analyze_regex", "Regex-based pattern analysis"),
        ("[^]", "analyze_ast", "AST-based code analysis"),
        ("[=]", "linkfinder", "Enhanced endpoint discovery"),
        ("[!]", "secrets", "Secret detection with mantra integration"),
        ("[?]", "sinks", "Vulnerability sink detection"),
        ("[<>]", "correlate", "Intelligence correlation engine"),
        ("[C]", "cache", "Comprehensive caching system"),
    )

    heading = "[bold cyan]JSEye v1.0.2 - Available Modules[/bold cyan]\n\n"
    rows = "".join(
        f"{tag} [bold green]{module}[/bold green] - {summary}\n"
        for tag, module, summary in catalogue
    )
    body = (heading + rows).strip()

    console.print()
    console.print(
        Panel(body, style="cyan", title="[bold white]Module Overview[/bold white]")
    )
    console.print()
|
|
91
|
+
|
|
92
|
+
def validate_environment():
    """Check that the runtime can support a JSEye run.

    Two checks are performed:

    * the interpreter is Python 3.10 or newer;
    * a scratch directory can be created in, and removed from, the current
      working directory.

    Returns:
        list[str]: one human-readable message per problem found; empty when
        no problem was detected.
    """
    import tempfile  # local import: only needed for the write probe

    issues = []

    if sys.version_info < (3, 10):
        issues.append("Python 3.10+ required")

    # Probe with a uniquely named scratch directory. A fixed name would
    # collide with an existing non-empty ".jseye_test" entry and misreport
    # that as a permission problem. Only OSError is treated as "cannot
    # write", so KeyboardInterrupt and programming errors are not masked.
    try:
        with tempfile.TemporaryDirectory(prefix=".jseye_test_", dir=Path.cwd()):
            pass
    except OSError:
        issues.append("No write permissions in current directory")

    return issues
|
|
109
|
+
|
|
110
|
+
def main():
    """Run the JSEye command-line interface.

    Steps, in order: parse arguments, show the banner(s) unless disabled,
    handle ``--list-modules``, check that ``--input`` and ``--output`` were
    given, validate the environment, validate the input path, create the
    output directory, optionally check/install external tools, then run the
    pipeline and print its summary.

    Returns:
        int: ``0`` on success (or after ``--list-modules``); ``1`` on any
        validation failure, pipeline error, or user interruption.
    """
    parser = create_parser()
    args = parser.parse_args()

    # Show banner (unless disabled).
    # NOTE(review): the performance banner is assumed to be nested under the
    # banner check (i.e. --no-banner suppresses both) -- confirm.
    if not args.no_banner:
        show_banner()
        if args.performance:
            show_performance_banner()

    # List modules if requested; needs neither --input nor --output.
    if args.list_modules:
        list_modules()
        return 0

    # --input/--output are enforced here rather than by argparse.
    if not args.input or not args.output:
        console.print("[red]Error: Both --input and --output are required[/red]")
        parser.print_help()
        return 1

    env_issues = validate_environment()
    if env_issues:
        console.print("[red]Environment Issues:[/red]")
        for issue in env_issues:
            console.print(f" • {issue}")
        return 1

    # Validate input path: it must exist and must not be a directory.
    input_path = Path(args.input)
    if not input_path.exists():
        console.print(f"[red]Error: Input file '{args.input}' not found[/red]")
        return 1
    if input_path.is_dir():
        # exists() alone lets a directory through; reject it up front with a
        # clear message. Non-regular files such as pipes are still accepted.
        console.print(f"[red]Error: Input path '{args.input}' is a directory, not a file[/red]")
        return 1

    # Create output directory (including parents) if needed.
    output_dir = Path(args.output)
    try:
        output_dir.mkdir(parents=True, exist_ok=True)
        console.print(f"[green]Output directory: {output_dir.absolute()}[/green]")
    except Exception as e:
        console.print(f"[red]Error creating output directory: {e}[/red]")
        return 1

    try:
        # Check and install tools if needed.
        if not args.no_install:
            console.print("[yellow]Checking required tools...[/yellow]")
            if not check_and_install_tools():
                console.print("[red]Failed to install required tools[/red]")
                return 1

        # Initialize and run the pipeline; the parsed args are passed through
        # so the pipeline can read the module-control flags.
        console.print("[bold cyan]Starting JSEye v1.0.2 High-Performance Pipeline...[/bold cyan]")
        pipeline = JSEyePipeline(args.input, args.output, args)
        results = pipeline.run()

        pipeline.show_summary(results)

        console.print("[bold green][+] JSEye execution completed successfully![/bold green]")
        return 0

    except KeyboardInterrupt:
        console.print("\n[yellow][!] Interrupted by user[/yellow]")
        return 1
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        console.print(f"[red][-] Error: {e}[/red]")
        return 1
|
|
180
|
+
|
|
181
|
+
# Script entry point: main()'s integer return value becomes the process
# exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""
|
|
2
|
+
URL Harvesting Module - Parallel execution for maximum speed
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import subprocess
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import List, Set, Dict
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
11
|
+
|
|
12
|
+
from ..utils.logger import log_progress
|
|
13
|
+
from ..utils.hashing import DeduplicatorHash
|
|
14
|
+
|
|
15
|
+
# Module-level Rich console.
# NOTE(review): nothing in this module appears to use it (progress goes
# through log_progress) -- confirm before removing.
console = Console()
|
|
16
|
+
|
|
17
|
+
class URLHarvester:
    """Harvest URLs for a list of domains by running external recon tools.

    Every domain is passed to gau, waybackurls, hakrawler, katana and subjs.
    The tools are launched concurrently as asyncio subprocesses; if that path
    raises, the same commands are run one after another instead. Results are
    deduplicated with ``DeduplicatorHash`` and written to
    ``harvested_urls.txt`` in the output directory.
    """

    # Tools run for every domain, in the order their results are merged.
    TOOLS = ("gau", "waybackurls", "hakrawler", "katana", "subjs")

    # Upper bound on domains harvested at the same time, to avoid
    # overwhelming the system (each domain spawns one process per tool).
    MAX_CONCURRENT_DOMAINS = 3

    def __init__(self, output_dir: Path):
        """Store the output directory and create the URL deduplicator.

        Args:
            output_dir: Directory that receives ``harvested_urls.txt``.
        """
        self.output_dir = output_dir
        self.deduplicator = DeduplicatorHash()
        self.timeout = 120  # seconds allowed per tool invocation (2 minutes)

    @staticmethod
    def _build_command(tool_name: str, domain: str):
        """Return the argv list for *tool_name* run against *domain*, or ``None`` if the tool is unknown."""
        commands = {
            "gau": ["gau", domain],
            "waybackurls": ["waybackurls", domain],
            "hakrawler": ["hakrawler", "-url", domain, "-depth", "2", "-plain"],
            "katana": ["katana", "-u", domain, "-depth", "2", "-silent"],
            "subjs": ["subjs", "-d", domain],
        }
        return commands.get(tool_name)

    @staticmethod
    def _parse_urls(output: str) -> List[str]:
        """Split tool output on newlines into stripped, non-empty entries."""
        return [line.strip() for line in output.split('\n') if line.strip()]

    def _save_urls(self, urls: List[str]) -> None:
        """Write *urls*, one per line, to ``harvested_urls.txt`` in the output directory."""
        urls_file = self.output_dir / "harvested_urls.txt"
        # Explicit UTF-8: harvested URLs can contain non-ASCII characters that
        # a platform-default encoding may be unable to represent.
        with open(urls_file, 'w', encoding='utf-8') as f:
            f.writelines(f"{url}\n" for url in urls)

    async def run_tool_async(self, tool_name: str, domain: str) -> List[str]:
        """Run a single tool against a single domain as an asyncio subprocess.

        Args:
            tool_name: One of the names in ``TOOLS``.
            domain: Domain passed to the tool.

        Returns:
            The URLs printed by the tool. An empty list is returned for an
            unknown tool, a non-zero exit status, empty output, a timeout
            (``self.timeout`` seconds), or any error while launching or
            reading the process.
        """
        cmd = self._build_command(tool_name, domain)
        if cmd is None:
            return []

        try:
            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )

            try:
                stdout, _stderr = await asyncio.wait_for(
                    process.communicate(),
                    timeout=self.timeout,
                )
            except asyncio.TimeoutError:
                process.kill()
                # Wait for the killed child so it is reaped and its pipes are
                # closed instead of lingering until the event loop shuts down.
                await process.wait()
                log_progress(f"{tool_name}: timeout for {domain}")
                return []

            if process.returncode == 0 and stdout:
                urls = self._parse_urls(stdout.decode('utf-8', errors='ignore'))
                log_progress(f"{tool_name}: found {len(urls)} URLs for {domain}")
                return urls

            log_progress(f"{tool_name}: no results for {domain}")
            return []

        except Exception as e:
            # Best-effort: a missing binary or any other failure of one tool
            # must not abort the harvest of the remaining tools.
            log_progress(f"{tool_name}: error for {domain} - {e}")
            return []

    async def harvest_domain_parallel(self, domain: str) -> List[str]:
        """Run every tool in ``TOOLS`` against *domain* concurrently.

        Returns:
            The concatenated (not yet deduplicated) URLs from all tools.
        """
        results = await asyncio.gather(
            *(self.run_tool_async(tool, domain) for tool in self.TOOLS),
            return_exceptions=True,
        )

        all_urls = []
        for tool, result in zip(self.TOOLS, results):
            if isinstance(result, list):
                all_urls.extend(result)
            else:
                # gather(return_exceptions=True) hands back the exception object.
                log_progress(f"{tool}: exception for {domain} - {result}")

        return all_urls

    async def harvest_all_domains_parallel(self, domains: List[str]) -> List[str]:
        """Harvest every domain, at most ``MAX_CONCURRENT_DOMAINS`` at a time.

        Returns:
            The concatenated (not yet deduplicated) URLs from all domains.
        """
        log_progress(f"Harvesting URLs from {len(domains)} domains in parallel")

        semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOMAINS)

        async def harvest_with_semaphore(domain):
            async with semaphore:
                return await self.harvest_domain_parallel(domain)

        domain_results = await asyncio.gather(
            *(harvest_with_semaphore(domain) for domain in domains),
            return_exceptions=True,
        )

        all_urls = []
        for domain, result in zip(domains, domain_results):
            if isinstance(result, list):
                all_urls.extend(result)
                log_progress(f"Domain {domain}: {len(result)} URLs")
            else:
                log_progress(f"Domain {domain}: failed - {result}")

        return all_urls

    def harvest_urls(self, domains: List[str]) -> List[str]:
        """
        Harvest URLs from all sources in parallel (main entry point)

        If the parallel run raises, the sequential fallback is used instead.
        In both cases the resulting list is written to ``harvested_urls.txt``.

        Args:
            domains: List of domains to harvest

        Returns:
            Deduplicated list of URLs
        """
        log_progress(">> Starting PARALLEL URL harvesting (gau, waybackurls, hakrawler, katana, subjs)")

        try:
            all_urls = asyncio.run(self.harvest_all_domains_parallel(domains))
        except Exception as e:
            log_progress(f"Parallel harvesting failed, falling back to sequential: {e}")
            unique_urls = self.harvest_urls_sequential(domains)
            # Persist the fallback result too, so the output file exists
            # whichever path produced the URLs.
            self._save_urls(unique_urls)
            return unique_urls

        unique_urls = self.deduplicator.deduplicate_list(all_urls)

        log_progress(f">> PARALLEL harvest complete: {len(unique_urls)} unique URLs from {len(all_urls)} total")

        self._save_urls(unique_urls)

        return unique_urls

    def harvest_urls_sequential(self, domains: List[str]) -> List[str]:
        """Run every tool for every domain one after another (fallback path).

        Args:
            domains: List of domains to harvest.

        Returns:
            The collected URLs after ``DeduplicatorHash.deduplicate_list``.
        """
        log_progress("Running sequential URL harvesting (fallback)")

        all_urls = []
        for domain in domains:
            for tool_name in self.TOOLS:
                try:
                    result = subprocess.run(
                        self._build_command(tool_name, domain),
                        capture_output=True,
                        text=True,
                        # Decode like the async path: UTF-8, dropping bytes
                        # that are not valid, so one bad byte cannot discard
                        # a tool's whole output.
                        encoding='utf-8',
                        errors='ignore',
                        timeout=self.timeout,
                    )

                    if result.returncode == 0 and result.stdout:
                        all_urls.extend(self._parse_urls(result.stdout))

                except Exception as e:
                    log_progress(f"{tool_name} failed for {domain}: {e}")

        return self.deduplicator.deduplicate_list(all_urls)
|