jseye 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jseye/__init__.py +1 -1
- jseye/__main__.py +9 -0
- jseye/banner.py +59 -12
- jseye/cli.py +87 -42
- jseye/installer.py +2 -5
- jseye/modules/harvest.py +125 -72
- jseye/modules/js_download.py +235 -39
- jseye/modules/js_filter.py +156 -101
- jseye/modules/linkfinder.py +337 -27
- jseye/modules/tiered_analysis.py +304 -0
- jseye/pipeline.py +188 -70
- jseye/utils/cache.py +241 -0
- {jseye-1.0.0.dist-info → jseye-1.0.2.dist-info}/METADATA +2 -2
- jseye-1.0.2.dist-info/RECORD +31 -0
- jseye-1.0.0.dist-info/RECORD +0 -28
- {jseye-1.0.0.dist-info → jseye-1.0.2.dist-info}/WHEEL +0 -0
- {jseye-1.0.0.dist-info → jseye-1.0.2.dist-info}/entry_points.txt +0 -0
- {jseye-1.0.0.dist-info → jseye-1.0.2.dist-info}/licenses/LICENSE +0 -0
- {jseye-1.0.0.dist-info → jseye-1.0.2.dist-info}/top_level.txt +0 -0
jseye/__init__.py
CHANGED
jseye/__main__.py
ADDED
jseye/banner.py
CHANGED
|
@@ -1,37 +1,84 @@
|
|
|
1
1
|
"""
|
|
2
|
-
JSEye Banner Display
|
|
2
|
+
JSEye Banner Display with Version Checking
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
+
import requests
|
|
5
6
|
from rich.console import Console
|
|
6
7
|
from rich.text import Text
|
|
7
8
|
from rich.align import Align
|
|
9
|
+
from rich.panel import Panel
|
|
10
|
+
from . import __version__
|
|
8
11
|
|
|
9
12
|
console = Console()
|
|
10
13
|
|
|
14
|
+
def get_latest_pypi_version():
|
|
15
|
+
"""Get the latest version from PyPI"""
|
|
16
|
+
try:
|
|
17
|
+
response = requests.get("https://pypi.org/pypi/jseye/json", timeout=3)
|
|
18
|
+
if response.status_code == 200:
|
|
19
|
+
data = response.json()
|
|
20
|
+
return data["info"]["version"]
|
|
21
|
+
except:
|
|
22
|
+
pass
|
|
23
|
+
return None
|
|
24
|
+
|
|
11
25
|
def show_banner():
|
|
12
|
-
"""Display the JSEye banner
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
26
|
+
"""Display the enhanced JSEye banner"""
|
|
27
|
+
# Add some top padding first
|
|
28
|
+
console.print()
|
|
29
|
+
|
|
30
|
+
# ASCII Art Banner - clean and properly aligned
|
|
31
|
+
banner = """ ▄▄▄▄▄▄ ▄▄▄▄▄ ▄▄▄▄▄▄▄
|
|
16
32
|
█▀ ██ ██▀▀▀▀█▄ █▀██▀▀▀
|
|
17
33
|
██ ▀██▄ ▄▀ ██
|
|
18
34
|
██ ▀██▄▄ ████ ██ ██ ▄█▀█▄
|
|
19
35
|
██ ▄ ▀██▄ ██ ██▄██ ██▄█▀
|
|
20
|
-
██ ▀██████▀
|
|
36
|
+
██ ▀██████▀ ▀█████▄▄▀██▀ ▀██▄▄▄
|
|
21
37
|
▄ ██ ██
|
|
22
|
-
▀████▀ ▀▀▀
|
|
23
|
-
"""
|
|
38
|
+
▀████▀ ▀▀▀ """
|
|
24
39
|
|
|
25
|
-
# Display banner in cyan
|
|
26
|
-
banner_text = Text(banner
|
|
40
|
+
# Display banner in cyan
|
|
41
|
+
banner_text = Text(banner, style="cyan bold")
|
|
27
42
|
console.print(Align.center(banner_text))
|
|
28
43
|
console.print()
|
|
29
44
|
|
|
30
|
-
# Tagline
|
|
31
|
-
tagline = Text("
|
|
45
|
+
# Tagline centered
|
|
46
|
+
tagline = Text("JavaScript Intelligence & Attack Surface Discovery", style="green bold")
|
|
32
47
|
console.print(Align.center(tagline))
|
|
33
48
|
|
|
49
|
+
# Version info with PyPI check
|
|
50
|
+
current_version = f"v{__version__}"
|
|
51
|
+
version_text = f"Current Version: {current_version}"
|
|
52
|
+
|
|
53
|
+
# Check for latest version on PyPI
|
|
54
|
+
latest_version = get_latest_pypi_version()
|
|
55
|
+
if latest_version and latest_version != __version__:
|
|
56
|
+
version_text += f" | Latest: v{latest_version} (Update available!)"
|
|
57
|
+
version_style = "yellow bold"
|
|
58
|
+
else:
|
|
59
|
+
version_style = "green"
|
|
60
|
+
|
|
61
|
+
version_info = Text(version_text, style=version_style)
|
|
62
|
+
console.print(Align.center(version_info))
|
|
63
|
+
|
|
64
|
+
# Author info
|
|
34
65
|
author = Text("Author: Lakshmikanthan K (letchupkt)", style="purple")
|
|
35
66
|
console.print(Align.center(author))
|
|
36
67
|
|
|
68
|
+
console.print()
|
|
69
|
+
|
|
70
|
+
def show_performance_banner():
|
|
71
|
+
"""Show performance-focused banner for v1.0.2"""
|
|
72
|
+
# Performance upgrade banner
|
|
73
|
+
perf_banner = Panel.fit(
|
|
74
|
+
"[bold cyan]>> JSEye v1.0.2 - Performance Revolution[/bold cyan]\n"
|
|
75
|
+
"[green][+] Parallel Tool Execution (3-5x faster)[/green]\n"
|
|
76
|
+
"[green][+] Smart JS Prioritization (60-70% time saved)[/green]\n"
|
|
77
|
+
"[green][+] Tiered Analysis Engine[/green]\n"
|
|
78
|
+
"[green][+] Comprehensive Caching System[/green]\n"
|
|
79
|
+
"[green][+] Full CPU Utilization[/green]",
|
|
80
|
+
style="cyan",
|
|
81
|
+
title="[bold white]Performance Upgrades[/bold white]"
|
|
82
|
+
)
|
|
83
|
+
console.print(Align.center(perf_banner))
|
|
37
84
|
console.print()
|
jseye/cli.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
"""
|
|
3
|
-
JSEye CLI - Main
|
|
3
|
+
JSEye CLI - Enhanced Main Entry Point
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
6
|
import os
|
|
@@ -8,42 +8,42 @@ import sys
|
|
|
8
8
|
import argparse
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from rich.console import Console
|
|
11
|
+
from rich.panel import Panel
|
|
11
12
|
|
|
12
|
-
from .banner import show_banner
|
|
13
|
+
from .banner import show_banner, show_performance_banner
|
|
13
14
|
from .installer import check_and_install_tools
|
|
14
15
|
from .pipeline import JSEyePipeline
|
|
15
16
|
|
|
16
17
|
console = Console()
|
|
17
18
|
|
|
18
|
-
def clear_terminal():
|
|
19
|
-
"""Clear terminal screen (cross-platform)"""
|
|
20
|
-
import platform
|
|
21
|
-
if platform.system() == "Windows":
|
|
22
|
-
os.system("cls")
|
|
23
|
-
else:
|
|
24
|
-
os.system("clear")
|
|
25
|
-
|
|
26
19
|
def create_parser():
|
|
27
|
-
"""Create argument parser"""
|
|
20
|
+
"""Create enhanced argument parser"""
|
|
28
21
|
parser = argparse.ArgumentParser(
|
|
29
|
-
description="JSEye - JavaScript Intelligence
|
|
22
|
+
description="JSEye v1.0.2 - High-Performance JavaScript Intelligence Framework",
|
|
30
23
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
31
24
|
epilog="""
|
|
32
25
|
Examples:
|
|
33
|
-
jseye -i subs.txt -o output # Full pipeline
|
|
26
|
+
jseye -i subs.txt -o output # Full parallel pipeline
|
|
34
27
|
jseye -i subs.txt -o output --js-only # Stop after JS discovery
|
|
35
28
|
jseye -i subs.txt -o output --no-secrets # Skip secrets detection
|
|
36
29
|
jseye -i subs.txt -o output --regex-only # Only regex analysis
|
|
30
|
+
jseye -i subs.txt -o output --performance # Show performance banner
|
|
37
31
|
"""
|
|
38
32
|
)
|
|
39
33
|
|
|
40
|
-
# Required arguments
|
|
34
|
+
# Required arguments
|
|
41
35
|
parser.add_argument("-i", "--input",
|
|
42
36
|
help="Input file containing subdomains")
|
|
43
37
|
parser.add_argument("-o", "--output",
|
|
44
38
|
help="Output directory for results")
|
|
45
39
|
|
|
46
|
-
#
|
|
40
|
+
# Performance and UI options
|
|
41
|
+
parser.add_argument("--performance", action="store_true",
|
|
42
|
+
help="Show performance upgrades banner")
|
|
43
|
+
parser.add_argument("--no-banner", action="store_true",
|
|
44
|
+
help="Skip banner display")
|
|
45
|
+
|
|
46
|
+
# Module control flags
|
|
47
47
|
parser.add_argument("--js-only", action="store_true",
|
|
48
48
|
help="Stop after JavaScript discovery")
|
|
49
49
|
parser.add_argument("--no-install", action="store_true",
|
|
@@ -53,7 +53,7 @@ Examples:
|
|
|
53
53
|
parser.add_argument("--regex-only", action="store_true",
|
|
54
54
|
help="Only perform regex analysis")
|
|
55
55
|
parser.add_argument("--no-secrets", action="store_true",
|
|
56
|
-
help="Skip secrets detection
|
|
56
|
+
help="Skip secrets detection")
|
|
57
57
|
parser.add_argument("--no-sinks", action="store_true",
|
|
58
58
|
help="Skip sink detection")
|
|
59
59
|
parser.add_argument("--no-correlate", action="store_true",
|
|
@@ -64,50 +64,93 @@ Examples:
|
|
|
64
64
|
return parser
|
|
65
65
|
|
|
66
66
|
def list_modules():
|
|
67
|
-
"""List available modules"""
|
|
68
|
-
|
|
69
|
-
"harvest
|
|
70
|
-
"js_filter
|
|
71
|
-
"js_download
|
|
72
|
-
"
|
|
73
|
-
"
|
|
74
|
-
"
|
|
75
|
-
"
|
|
76
|
-
"
|
|
77
|
-
"
|
|
67
|
+
"""List available modules with enhanced display"""
|
|
68
|
+
modules_info = [
|
|
69
|
+
("harvest", "Parallel URL harvesting (gau, waybackurls, katana, hakrawler, subjs)", "[*]"),
|
|
70
|
+
("js_filter", "Intelligent JavaScript filtering with prioritization", "[>]"),
|
|
71
|
+
("js_download", "Parallel JavaScript file downloading with caching", "[>>]"),
|
|
72
|
+
("tiered_analysis", "Smart tiered analysis engine (T1/T2/T3)", "[#]"),
|
|
73
|
+
("analyze_regex", "Regex-based pattern analysis", "[~]"),
|
|
74
|
+
("analyze_ast", "AST-based code analysis", "[^]"),
|
|
75
|
+
("linkfinder", "Enhanced endpoint discovery", "[=]"),
|
|
76
|
+
("secrets", "Secret detection with mantra integration", "[!]"),
|
|
77
|
+
("sinks", "Vulnerability sink detection", "[?]"),
|
|
78
|
+
("correlate", "Intelligence correlation engine", "[<>]"),
|
|
79
|
+
("cache", "Comprehensive caching system", "[C]")
|
|
78
80
|
]
|
|
79
81
|
|
|
80
|
-
console.print("\n[bold cyan]Available JSEye Modules:[/bold cyan]")
|
|
81
|
-
for module in modules:
|
|
82
|
-
console.print(f" • {module}")
|
|
83
82
|
console.print()
|
|
83
|
+
panel_content = "[bold cyan]JSEye v1.0.2 - Available Modules[/bold cyan]\n\n"
|
|
84
|
+
|
|
85
|
+
for name, desc, symbol in modules_info:
|
|
86
|
+
panel_content += f"{symbol} [bold green]{name}[/bold green] - {desc}\n"
|
|
87
|
+
|
|
88
|
+
panel = Panel(panel_content.strip(), style="cyan", title="[bold white]Module Overview[/bold white]")
|
|
89
|
+
console.print(panel)
|
|
90
|
+
console.print()
|
|
91
|
+
|
|
92
|
+
def validate_environment():
|
|
93
|
+
"""Validate the runtime environment"""
|
|
94
|
+
issues = []
|
|
95
|
+
|
|
96
|
+
# Check Python version
|
|
97
|
+
if sys.version_info < (3, 10):
|
|
98
|
+
issues.append("Python 3.10+ required")
|
|
99
|
+
|
|
100
|
+
# Check write permissions for output
|
|
101
|
+
try:
|
|
102
|
+
test_dir = Path.cwd() / ".jseye_test"
|
|
103
|
+
test_dir.mkdir(exist_ok=True)
|
|
104
|
+
test_dir.rmdir()
|
|
105
|
+
except:
|
|
106
|
+
issues.append("No write permissions in current directory")
|
|
107
|
+
|
|
108
|
+
return issues
|
|
84
109
|
|
|
85
110
|
def main():
|
|
86
|
-
"""
|
|
111
|
+
"""Enhanced main CLI entry point"""
|
|
87
112
|
parser = create_parser()
|
|
88
113
|
args = parser.parse_args()
|
|
89
114
|
|
|
90
|
-
#
|
|
91
|
-
|
|
92
|
-
|
|
115
|
+
# Show banner (unless disabled)
|
|
116
|
+
if not args.no_banner:
|
|
117
|
+
show_banner()
|
|
118
|
+
if args.performance:
|
|
119
|
+
show_performance_banner()
|
|
93
120
|
|
|
94
121
|
# List modules if requested
|
|
95
122
|
if args.list_modules:
|
|
96
123
|
list_modules()
|
|
97
124
|
return 0
|
|
98
125
|
|
|
99
|
-
# Validate required arguments
|
|
126
|
+
# Validate required arguments
|
|
100
127
|
if not args.input or not args.output:
|
|
101
|
-
|
|
128
|
+
console.print("[red]Error: Both --input and --output are required[/red]")
|
|
129
|
+
parser.print_help()
|
|
130
|
+
return 1
|
|
131
|
+
|
|
132
|
+
# Validate environment
|
|
133
|
+
env_issues = validate_environment()
|
|
134
|
+
if env_issues:
|
|
135
|
+
console.print("[red]Environment Issues:[/red]")
|
|
136
|
+
for issue in env_issues:
|
|
137
|
+
console.print(f" • {issue}")
|
|
138
|
+
return 1
|
|
102
139
|
|
|
103
140
|
# Validate input file
|
|
104
|
-
|
|
141
|
+
input_path = Path(args.input)
|
|
142
|
+
if not input_path.exists():
|
|
105
143
|
console.print(f"[red]Error: Input file '{args.input}' not found[/red]")
|
|
106
144
|
return 1
|
|
107
145
|
|
|
108
146
|
# Create output directory
|
|
109
147
|
output_dir = Path(args.output)
|
|
110
|
-
|
|
148
|
+
try:
|
|
149
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
150
|
+
console.print(f"[green]Output directory: {output_dir.absolute()}[/green]")
|
|
151
|
+
except Exception as e:
|
|
152
|
+
console.print(f"[red]Error creating output directory: {e}[/red]")
|
|
153
|
+
return 1
|
|
111
154
|
|
|
112
155
|
try:
|
|
113
156
|
# Check and install tools if needed
|
|
@@ -117,20 +160,22 @@ def main():
|
|
|
117
160
|
console.print("[red]Failed to install required tools[/red]")
|
|
118
161
|
return 1
|
|
119
162
|
|
|
120
|
-
# Initialize and run pipeline
|
|
163
|
+
# Initialize and run enhanced pipeline
|
|
164
|
+
console.print("[bold cyan]Starting JSEye v1.0.2 High-Performance Pipeline...[/bold cyan]")
|
|
121
165
|
pipeline = JSEyePipeline(args.input, args.output, args)
|
|
122
166
|
results = pipeline.run()
|
|
123
167
|
|
|
124
|
-
# Show summary
|
|
168
|
+
# Show enhanced summary
|
|
125
169
|
pipeline.show_summary(results)
|
|
126
170
|
|
|
171
|
+
console.print("[bold green][+] JSEye execution completed successfully![/bold green]")
|
|
127
172
|
return 0
|
|
128
173
|
|
|
129
174
|
except KeyboardInterrupt:
|
|
130
|
-
console.print("\n[yellow]Interrupted by user[/yellow]")
|
|
175
|
+
console.print("\n[yellow][!] Interrupted by user[/yellow]")
|
|
131
176
|
return 1
|
|
132
177
|
except Exception as e:
|
|
133
|
-
console.print(f"[red]Error: {e}[/red]")
|
|
178
|
+
console.print(f"[red][-] Error: {e}[/red]")
|
|
134
179
|
return 1
|
|
135
180
|
|
|
136
181
|
if __name__ == "__main__":
|
jseye/installer.py
CHANGED
|
@@ -37,11 +37,6 @@ REQUIRED_TOOLS = {
|
|
|
37
37
|
"install_cmd": ["go", "install", "github.com/lc/subjs@latest"],
|
|
38
38
|
"requires": "go"
|
|
39
39
|
},
|
|
40
|
-
"linkfinder": {
|
|
41
|
-
"check_cmd": ["python3", "-c", "import linkfinder"],
|
|
42
|
-
"install_cmd": ["pip3", "install", "linkfinder"],
|
|
43
|
-
"requires": "python3"
|
|
44
|
-
},
|
|
45
40
|
"mantra": {
|
|
46
41
|
"check_cmd": ["mantra", "--help"],
|
|
47
42
|
"install_cmd": ["go", "install", "github.com/MrEmpy/mantra@latest"],
|
|
@@ -170,6 +165,8 @@ def check_and_install_tools():
|
|
|
170
165
|
console.print("\n[green]All tools installed successfully![/green]")
|
|
171
166
|
return True
|
|
172
167
|
|
|
168
|
+
def install_linkfinder_special():
|
|
169
|
+
"""Special installation method for LinkFinder"""
|
|
173
170
|
def get_install_cache_path():
|
|
174
171
|
"""Get path for install cache"""
|
|
175
172
|
cache_dir = Path.home() / ".jseye"
|
jseye/modules/harvest.py
CHANGED
|
@@ -1,83 +1,120 @@
|
|
|
1
1
|
"""
|
|
2
|
-
URL Harvesting Module -
|
|
2
|
+
URL Harvesting Module - Parallel execution for maximum speed
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
+
import asyncio
|
|
6
|
+
import subprocess
|
|
5
7
|
from pathlib import Path
|
|
6
|
-
from typing import List, Set
|
|
8
|
+
from typing import List, Set, Dict
|
|
7
9
|
from rich.console import Console
|
|
10
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
8
11
|
|
|
9
|
-
from ..utils.shell import run_command, run_with_input_file
|
|
10
12
|
from ..utils.logger import log_progress
|
|
11
13
|
from ..utils.hashing import DeduplicatorHash
|
|
12
14
|
|
|
13
15
|
console = Console()
|
|
14
16
|
|
|
15
17
|
class URLHarvester:
|
|
16
|
-
"""Harvest URLs from multiple sources"""
|
|
18
|
+
"""Harvest URLs from multiple sources in parallel"""
|
|
17
19
|
|
|
18
20
|
def __init__(self, output_dir: Path):
|
|
19
21
|
self.output_dir = output_dir
|
|
20
22
|
self.deduplicator = DeduplicatorHash()
|
|
23
|
+
self.timeout = 120 # 2 minutes per tool
|
|
21
24
|
|
|
22
|
-
def
|
|
23
|
-
"""Run
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
25
|
+
async def run_tool_async(self, tool_name: str, domain: str) -> List[str]:
|
|
26
|
+
"""Run a single tool asynchronously"""
|
|
27
|
+
try:
|
|
28
|
+
if tool_name == "gau":
|
|
29
|
+
cmd = ["gau", domain]
|
|
30
|
+
elif tool_name == "waybackurls":
|
|
31
|
+
cmd = ["waybackurls", domain]
|
|
32
|
+
elif tool_name == "hakrawler":
|
|
33
|
+
cmd = ["hakrawler", "-url", domain, "-depth", "2", "-plain"]
|
|
34
|
+
elif tool_name == "katana":
|
|
35
|
+
cmd = ["katana", "-u", domain, "-depth", "2", "-silent"]
|
|
36
|
+
elif tool_name == "subjs":
|
|
37
|
+
cmd = ["subjs", "-d", domain]
|
|
38
|
+
else:
|
|
39
|
+
return []
|
|
40
|
+
|
|
41
|
+
# Run tool with timeout
|
|
42
|
+
process = await asyncio.create_subprocess_exec(
|
|
43
|
+
*cmd,
|
|
44
|
+
stdout=asyncio.subprocess.PIPE,
|
|
45
|
+
stderr=asyncio.subprocess.PIPE
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
try:
|
|
49
|
+
stdout, stderr = await asyncio.wait_for(
|
|
50
|
+
process.communicate(),
|
|
51
|
+
timeout=self.timeout
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
if process.returncode == 0 and stdout:
|
|
55
|
+
urls = [url.strip() for url in stdout.decode('utf-8', errors='ignore').split('\n') if url.strip()]
|
|
56
|
+
log_progress(f"{tool_name}: found {len(urls)} URLs for {domain}")
|
|
57
|
+
return urls
|
|
58
|
+
else:
|
|
59
|
+
log_progress(f"{tool_name}: no results for {domain}")
|
|
60
|
+
return []
|
|
61
|
+
|
|
62
|
+
except asyncio.TimeoutError:
|
|
63
|
+
process.kill()
|
|
64
|
+
log_progress(f"{tool_name}: timeout for {domain}")
|
|
65
|
+
return []
|
|
66
|
+
|
|
67
|
+
except Exception as e:
|
|
68
|
+
log_progress(f"{tool_name}: error for {domain} - {e}")
|
|
69
|
+
return []
|
|
34
70
|
|
|
35
|
-
def
|
|
36
|
-
"""
|
|
37
|
-
|
|
71
|
+
async def harvest_domain_parallel(self, domain: str) -> List[str]:
|
|
72
|
+
"""Harvest URLs from a single domain using all tools in parallel"""
|
|
73
|
+
tools = ["gau", "waybackurls", "hakrawler", "katana", "subjs"]
|
|
38
74
|
|
|
75
|
+
# Run all tools in parallel
|
|
76
|
+
tasks = [self.run_tool_async(tool, domain) for tool in tools]
|
|
77
|
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
78
|
+
|
|
79
|
+
# Collect all URLs
|
|
39
80
|
all_urls = []
|
|
40
|
-
for
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
81
|
+
for i, result in enumerate(results):
|
|
82
|
+
if isinstance(result, list):
|
|
83
|
+
all_urls.extend(result)
|
|
84
|
+
else:
|
|
85
|
+
log_progress(f"{tools[i]}: exception for {domain} - {result}")
|
|
45
86
|
|
|
46
87
|
return all_urls
|
|
47
88
|
|
|
48
|
-
def
|
|
49
|
-
"""
|
|
50
|
-
log_progress("
|
|
89
|
+
async def harvest_all_domains_parallel(self, domains: List[str]) -> List[str]:
|
|
90
|
+
"""Harvest URLs from all domains in parallel"""
|
|
91
|
+
log_progress(f"Harvesting URLs from {len(domains)} domains in parallel")
|
|
51
92
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
success, stdout, stderr = run_command([
|
|
55
|
-
"hakrawler", "-url", domain, "-depth", "2", "-plain"
|
|
56
|
-
])
|
|
57
|
-
if success and stdout:
|
|
58
|
-
urls = [url.strip() for url in stdout.split('\n') if url.strip()]
|
|
59
|
-
all_urls.extend(urls)
|
|
93
|
+
# Limit concurrent domains to avoid overwhelming the system
|
|
94
|
+
semaphore = asyncio.Semaphore(3) # Max 3 domains at once
|
|
60
95
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
"""Run katana tool"""
|
|
65
|
-
log_progress("Running katana...")
|
|
96
|
+
async def harvest_with_semaphore(domain):
|
|
97
|
+
async with semaphore:
|
|
98
|
+
return await self.harvest_domain_parallel(domain)
|
|
66
99
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
input_data=domain_input
|
|
71
|
-
)
|
|
100
|
+
# Run domain harvesting in parallel
|
|
101
|
+
tasks = [harvest_with_semaphore(domain) for domain in domains]
|
|
102
|
+
domain_results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
72
103
|
|
|
73
|
-
|
|
74
|
-
|
|
104
|
+
# Collect all URLs
|
|
105
|
+
all_urls = []
|
|
106
|
+
for i, result in enumerate(domain_results):
|
|
107
|
+
if isinstance(result, list):
|
|
108
|
+
all_urls.extend(result)
|
|
109
|
+
log_progress(f"Domain {domains[i]}: {len(result)} URLs")
|
|
110
|
+
else:
|
|
111
|
+
log_progress(f"Domain {domains[i]}: failed - {result}")
|
|
75
112
|
|
|
76
|
-
return
|
|
113
|
+
return all_urls
|
|
77
114
|
|
|
78
115
|
def harvest_urls(self, domains: List[str]) -> List[str]:
|
|
79
116
|
"""
|
|
80
|
-
Harvest URLs from all sources
|
|
117
|
+
Harvest URLs from all sources in parallel (main entry point)
|
|
81
118
|
|
|
82
119
|
Args:
|
|
83
120
|
domains: List of domains to harvest
|
|
@@ -85,35 +122,19 @@ class URLHarvester:
|
|
|
85
122
|
Returns:
|
|
86
123
|
Deduplicated list of URLs
|
|
87
124
|
"""
|
|
88
|
-
log_progress("
|
|
89
|
-
|
|
90
|
-
all_urls = []
|
|
91
|
-
|
|
92
|
-
# Run all harvesting tools
|
|
93
|
-
try:
|
|
94
|
-
all_urls.extend(self.run_gau(domains))
|
|
95
|
-
except Exception as e:
|
|
96
|
-
console.print(f"[yellow]Warning: gau failed - {e}[/yellow]")
|
|
97
|
-
|
|
98
|
-
try:
|
|
99
|
-
all_urls.extend(self.run_waybackurls(domains))
|
|
100
|
-
except Exception as e:
|
|
101
|
-
console.print(f"[yellow]Warning: waybackurls failed - {e}[/yellow]")
|
|
125
|
+
log_progress(">> Starting PARALLEL URL harvesting (gau, waybackurls, hakrawler, katana, subjs)")
|
|
102
126
|
|
|
127
|
+
# Run async harvesting
|
|
103
128
|
try:
|
|
104
|
-
all_urls.
|
|
129
|
+
all_urls = asyncio.run(self.harvest_all_domains_parallel(domains))
|
|
105
130
|
except Exception as e:
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
try:
|
|
109
|
-
all_urls.extend(self.run_katana(domains))
|
|
110
|
-
except Exception as e:
|
|
111
|
-
console.print(f"[yellow]Warning: katana failed - {e}[/yellow]")
|
|
131
|
+
log_progress(f"Parallel harvesting failed, falling back to sequential: {e}")
|
|
132
|
+
return self.harvest_urls_sequential(domains)
|
|
112
133
|
|
|
113
134
|
# Deduplicate URLs
|
|
114
135
|
unique_urls = self.deduplicator.deduplicate_list(all_urls)
|
|
115
136
|
|
|
116
|
-
log_progress(f"
|
|
137
|
+
log_progress(f">> PARALLEL harvest complete: {len(unique_urls)} unique URLs from {len(all_urls)} total")
|
|
117
138
|
|
|
118
139
|
# Save raw URLs
|
|
119
140
|
urls_file = self.output_dir / "harvested_urls.txt"
|
|
@@ -121,4 +142,36 @@ class URLHarvester:
|
|
|
121
142
|
for url in unique_urls:
|
|
122
143
|
f.write(f"{url}\n")
|
|
123
144
|
|
|
124
|
-
return unique_urls
|
|
145
|
+
return unique_urls
|
|
146
|
+
|
|
147
|
+
def harvest_urls_sequential(self, domains: List[str]) -> List[str]:
|
|
148
|
+
"""Fallback sequential harvesting if parallel fails"""
|
|
149
|
+
log_progress("Running sequential URL harvesting (fallback)")
|
|
150
|
+
|
|
151
|
+
all_urls = []
|
|
152
|
+
tools = [
|
|
153
|
+
("gau", lambda d: ["gau", d]),
|
|
154
|
+
("waybackurls", lambda d: ["waybackurls", d]),
|
|
155
|
+
("hakrawler", lambda d: ["hakrawler", "-url", d, "-depth", "2", "-plain"]),
|
|
156
|
+
("katana", lambda d: ["katana", "-u", d, "-depth", "2", "-silent"]),
|
|
157
|
+
("subjs", lambda d: ["subjs", "-d", d])
|
|
158
|
+
]
|
|
159
|
+
|
|
160
|
+
for domain in domains:
|
|
161
|
+
for tool_name, cmd_func in tools:
|
|
162
|
+
try:
|
|
163
|
+
result = subprocess.run(
|
|
164
|
+
cmd_func(domain),
|
|
165
|
+
capture_output=True,
|
|
166
|
+
text=True,
|
|
167
|
+
timeout=self.timeout
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
if result.returncode == 0 and result.stdout:
|
|
171
|
+
urls = [url.strip() for url in result.stdout.split('\n') if url.strip()]
|
|
172
|
+
all_urls.extend(urls)
|
|
173
|
+
|
|
174
|
+
except Exception as e:
|
|
175
|
+
log_progress(f"{tool_name} failed for {domain}: {e}")
|
|
176
|
+
|
|
177
|
+
return self.deduplicator.deduplicate_list(all_urls)
|