execution_agent-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37):
  1. execution_agent/__init__.py +8 -0
  2. execution_agent/__main__.py +5 -0
  3. execution_agent/agent.py +955 -0
  4. execution_agent/commands_interface.json +7 -0
  5. execution_agent/config.py +21 -0
  6. execution_agent/context.py +1565 -0
  7. execution_agent/docker_helpers_static.py +593 -0
  8. execution_agent/env.py +61 -0
  9. execution_agent/exceptions.py +17 -0
  10. execution_agent/exit_artifacts.py +350 -0
  11. execution_agent/main.py +1234 -0
  12. execution_agent/prompt_files/c_guidelines +481 -0
  13. execution_agent/prompt_files/command_stuck +7 -0
  14. execution_agent/prompt_files/cpp_guidelines +481 -0
  15. execution_agent/prompt_files/cycle_instruction +51 -0
  16. execution_agent/prompt_files/java_guidelines +37 -0
  17. execution_agent/prompt_files/javascript_guidelines +69 -0
  18. execution_agent/prompt_files/latest_containter_technology +7 -0
  19. execution_agent/prompt_files/python_guidelines +48 -0
  20. execution_agent/prompt_files/remove_progress_bars +1 -0
  21. execution_agent/prompt_files/rust_guidelines +53 -0
  22. execution_agent/prompt_files/search_workflows_summary +121 -0
  23. execution_agent/prompt_files/steps_list.json +32 -0
  24. execution_agent/prompt_files/summarize_cycle +13 -0
  25. execution_agent/prompt_files/tools_list +99 -0
  26. execution_agent/prompt_logging.py +311 -0
  27. execution_agent/repetition.py +39 -0
  28. execution_agent/shared_utils.py +507 -0
  29. execution_agent/state_persistence.py +286 -0
  30. execution_agent/tools.py +1611 -0
  31. execution_agent/trace_to_bash.py +281 -0
  32. execution_agent-0.1.0.dist-info/METADATA +231 -0
  33. execution_agent-0.1.0.dist-info/RECORD +37 -0
  34. execution_agent-0.1.0.dist-info/WHEEL +5 -0
  35. execution_agent-0.1.0.dist-info/entry_points.txt +2 -0
  36. execution_agent-0.1.0.dist-info/licenses/LICENSE.md +46 -0
  37. execution_agent-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1565 @@
# execution_agent/context.py
from __future__ import annotations

import json
import logging
import os
import re
import subprocess
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import quote_plus

try:
    import yaml
except ImportError:
    yaml = None  # type: ignore

try:
    import requests
except ImportError:
    requests = None  # type: ignore

_LOG = logging.getLogger("execution_agent.context")

def _llm_filter_cicd_file(
    file_path: str,
    file_content: str,
    model,
) -> Optional[str]:
    """
    Use an LLM to filter a CI/CD file and extract only the relevant parts.

    The LLM extracts parts related to:
    - Installing the repository from source code on Linux
    - Running test cases

    Args:
        file_path: Path to the CI/CD file (for context)
        file_content: Raw content of the CI/CD file
        model: The model instance to use for filtering

    Returns:
        Filtered content with only the relevant parts, or None if the file is
        not relevant (or filtering failed)
    """
    if not file_content or not file_content.strip():
        return None

    if model is None:
        _LOG.warning("No model provided for LLM filtering, falling back to heuristic extraction")
        return None

    prompt = f"""You are helping with a specific task: INSTALLING A SOFTWARE PROJECT FROM SOURCE CODE AND RUNNING ITS TEST SUITE inside a fresh Linux Docker container.

You are analyzing a CI/CD configuration file to extract ONLY the parts that would help accomplish this task.

FILE PATH: {file_path}

FILE CONTENT:
```
{file_content[:50000]}
```

CONTEXT - THE OVERALL GOAL:
We need to:
1. Clone a repository into a fresh Docker container (Ubuntu-based)
2. Install all necessary system dependencies and language runtimes
3. Build/install the project from source code
4. Run the project's test suite successfully

YOUR TASK:
1. First, determine if this CI/CD file contains information useful for the above goal.

A file IS RELEVANT if it contains:
- Commands to install system packages or language runtimes needed to build/run the project
- Commands to install project dependencies (pip install, npm install, cargo build, etc.)
- Commands to build or compile the project from source
- Commands to run tests (pytest, npm test, cargo test, make test, etc.)
- Environment setup needed before building/testing (env vars, services like databases)
- Container/Docker image specifications that show what base image or packages are needed

A file is NOT RELEVANT if it ONLY contains:
- Documentation building/deployment (mkdocs, sphinx, readthedocs, GitHub Pages)
- Code formatting/linting checks only (black, flake8, prettier, eslint) without build/test
- Release/publishing workflows (PyPI uploads, npm publish, Docker Hub push)
- Badge/status updates, notifications
- Security scanning without actual build/test steps
- Production/staging deployment (we only care about building and testing locally)

2. If the file is NOT relevant to building from source and running tests, respond with exactly: NOT_RELEVANT

3. If the file IS relevant, extract and output ONLY the parts useful for our goal:
- Shell commands (run:) that install dependencies, build the project, or run tests
- The specific test commands used (this is crucial - we need to know how to run tests)
- Required system packages and their installation commands
- Language/runtime version requirements (Python version, Node version, etc.)
- Environment variables needed for building or testing
- Container images or services (databases, redis, etc.) required for tests
- Any special setup steps or scripts that must run before tests

REMOVE (not useful for our goal):
- Trigger configurations (on:, push:, pull_request:, schedule:)
- Permissions blocks
- Caching configurations (we'll run fresh each time)
- Artifact upload/download (CI-specific)
- Secret/token references (we won't have these)
- GitHub-specific output commands
- Matrix configurations (just note "runs on multiple Python versions" if present)
- Jobs related to docs, linting-only, releases, or deployments

Output the extracted content in a clean, readable format. PRESERVE SHELL COMMANDS EXACTLY as written - these are the most valuable information for reproducing the build and test process."""

    try:
        response = model.query([{"role": "user", "content": prompt}])
        result = response.get("content", "").strip()

        if not result or result == "NOT_RELEVANT":
            _LOG.info(f"LLM determined '{file_path}' is not relevant for build/test")
            return None

        _LOG.info(f"LLM filtered '{file_path}' - extracted relevant content")
        return result

    except Exception as e:
        _LOG.warning(f"LLM filtering failed for '{file_path}': {e}; treating the file as not relevant")
        return None

def _extract_relevant_workflow_parts(workflow_content: str) -> str:
    """
    Extract only the relevant parts from a GitHub Actions workflow YAML.

    Keeps:
    - Job names and their 'runs-on' values
    - 'run:' commands (the actual shell commands)
    - 'uses:' actions that are relevant (setup-*, install-*, build-*, test-*)
    - Environment variables that might be relevant
    - Container/services definitions

    Removes:
    - Trigger configurations (on:, push:, pull_request:, schedule:, etc.)
    - Permissions blocks
    - Concurrency settings
    - Most 'with:' blocks (except for relevant setup actions)
    - Matrix configurations (just keeps the concept, not all variants)
    - Caching configurations
    - Artifact upload/download details
    - GitHub-specific tokens and secrets references
    """
    if not workflow_content or not workflow_content.strip():
        return ""

    # If yaml is not available, do basic text filtering
    if yaml is None:
        return _extract_workflow_parts_regex(workflow_content)

    try:
        data = yaml.safe_load(workflow_content)
        if not isinstance(data, dict):
            return _extract_workflow_parts_regex(workflow_content)
        return _extract_workflow_parts_structured(data)
    except Exception:
        return _extract_workflow_parts_regex(workflow_content)

def _extract_workflow_parts_structured(data: Dict[str, Any]) -> str:
    """Extract relevant parts from a parsed YAML workflow."""
    lines = []

    # Extract workflow name
    if data.get("name"):
        lines.append(f"Workflow: {data['name']}")

    # Extract environment variables at workflow level
    if data.get("env"):
        env_vars = data["env"]
        relevant_env = {
            k: v for k, v in env_vars.items()
            if not any(secret in str(v).upper() for secret in ["SECRET", "TOKEN", "KEY", "PASSWORD"])
        }
        if relevant_env:
            lines.append(f"Environment: {relevant_env}")

    jobs = data.get("jobs", {})
    if not jobs:
        return "\n".join(lines)

    for job_name, job_data in jobs.items():
        if not isinstance(job_data, dict):
            continue

        lines.append(f"\nJob: {job_name}")

        # Runs-on is useful to know the target OS
        if job_data.get("runs-on"):
            runs_on = job_data["runs-on"]
            # Simplify matrix expressions
            if isinstance(runs_on, str) and "${{" in runs_on:
                runs_on = "linux/macos/windows (matrix)"
            lines.append(f"  runs-on: {runs_on}")

        # Container info is very relevant
        if job_data.get("container"):
            container = job_data["container"]
            if isinstance(container, str):
                lines.append(f"  container: {container}")
            elif isinstance(container, dict) and container.get("image"):
                lines.append(f"  container: {container['image']}")

        # Services (like databases) are relevant
        if job_data.get("services"):
            services = job_data["services"]
            service_names = list(services.keys()) if isinstance(services, dict) else []
            if service_names:
                lines.append(f"  services: {', '.join(service_names)}")

        # Extract steps
        steps = job_data.get("steps", [])
        if not steps:
            continue

        lines.append("  steps:")
        for step in steps:
            if not isinstance(step, dict):
                continue

            step_name = step.get("name", "")

            # Handle 'uses:' actions
            if step.get("uses"):
                uses = step["uses"]
                # Only include relevant setup/build/test actions
                relevant_actions = ["setup-", "install-", "build-", "test-", "python", "node", "java", "go", "rust", "docker"]
                if any(action in uses.lower() for action in relevant_actions):
                    action_line = f"    - uses: {uses}"
                    # Include relevant 'with' parameters for setup actions
                    if step.get("with"):
                        with_data = step["with"]
                        # Filter out tokens, keys, etc.
                        relevant_with = {}
                        for k, v in with_data.items():
                            k_lower = k.lower()
                            v_str = str(v).upper()
                            if any(secret in k_lower or secret in v_str for secret in ["token", "secret", "key", "password", "credential"]):
                                continue
                            if k_lower in ["version", "python-version", "node-version", "java-version", "go-version", "architecture"]:
                                relevant_with[k] = v
                        if relevant_with:
                            action_line += f" (with: {relevant_with})"
                    lines.append(action_line)

            # Handle 'run:' commands - these are the most valuable
            if step.get("run"):
                run_cmd = step["run"].strip()
                # Skip if it's just echoing or setting outputs
                if run_cmd.startswith("echo ") and "GITHUB_OUTPUT" in run_cmd:
                    continue
                if "::set-output" in run_cmd or "::add-mask" in run_cmd:
                    continue

                if step_name:
                    lines.append(f"    - {step_name}:")
                lines.append("      run: |")
                for cmd_line in run_cmd.split("\n"):
                    cmd_line = cmd_line.strip()
                    if cmd_line:
                        lines.append(f"        {cmd_line}")

    return "\n".join(lines)

def _extract_workflow_parts_regex(workflow_content: str) -> str:
    """Fallback regex-based extraction when YAML parsing is unavailable."""
    lines = []
    in_run_block = False
    run_indent = 0

    # Patterns to skip
    skip_patterns = [
        r"^\s*on:\s*$",
        r"^\s*push:\s*$",
        r"^\s*pull_request",
        r"^\s*schedule:",
        r"^\s*workflow_dispatch",
        r"^\s*permissions:",
        r"^\s*concurrency:",
        r"^\s*branches:",
        r"^\s*tags:",
        r"^\s*paths:",
        r"^\s*types:",
        r"^\s*cron:",
        r"^\s*#",  # Comments
        r"\$\{\{\s*secrets\.",  # Secret references
        r"\$\{\{\s*github\.token",
    ]

    # Patterns to keep
    keep_patterns = [
        r"^\s*name:",
        r"^\s*jobs:",
        r"^\s*runs-on:",
        r"^\s*container:",
        r"^\s*services:",
        r"^\s*steps:",
        r"^\s*run:\s*[|>]?\s*$",
        r"^\s*run:\s+\S",
        r"^\s*-\s+name:",
        r"^\s*uses:\s+.*(setup|install|build|test|python|node|java|docker)",
        r"^\s*env:",
        r"^\s*(python|node|java|go)-version:",
    ]

    for line in workflow_content.split("\n"):
        # Check if we should skip this line
        should_skip = any(re.match(pattern, line, re.IGNORECASE) for pattern in skip_patterns)
        if should_skip and not in_run_block:
            continue

        # Track run blocks
        if re.match(r"^\s*run:\s*[|>]?\s*$", line):
            in_run_block = True
            run_indent = len(line) - len(line.lstrip())
            lines.append(line)
            continue

        if in_run_block:
            current_indent = len(line) - len(line.lstrip()) if line.strip() else run_indent + 1
            if current_indent > run_indent or not line.strip():
                lines.append(line)
                continue
            else:
                in_run_block = False

        # Check if we should keep this line
        should_keep = any(re.match(pattern, line, re.IGNORECASE) for pattern in keep_patterns)
        if should_keep:
            lines.append(line)

    return "\n".join(lines)

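# Illustrative example (hypothetical input, shown as a comment so importing this
# module stays side-effect free): given a workflow file containing
#
#     name: CI
#     on: [push]
#     jobs:
#       test:
#         runs-on: ubuntu-latest
#         steps:
#           - uses: actions/setup-python@v5
#           - run: pip install -e . && pytest
#
# _extract_relevant_workflow_parts() keeps roughly
#
#     Workflow: CI
#
#     Job: test
#       runs-on: ubuntu-latest
#       steps:
#         - uses: actions/setup-python@v5
#           run: |
#             pip install -e . && pytest
#
# while the trigger configuration (`on: [push]`) is dropped.
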
@dataclass
class RepoContext:
    project_path: str
    project_url: str
    language: str
    workflows: List[str]
    workflow_contents: List[Tuple[str, str]]  # (path, content)
    dockerfiles: List[str]
    dockerfile_contents: List[Tuple[str, str]]  # (path, content)
    search_results: List[Dict[str, Any]]  # loaded from cache or produced elsewhere
    requirement_files: Optional[List[Tuple[str, str]]] = None  # (path, content) - dependency/requirement files
    readme_content: Optional[str] = None  # README content (installation/build instructions)
    unified_summary: Optional[str] = None
    problems_memory: Optional[str] = None
    local_repo_available: bool = False  # True if the repo was cloned locally during preprocessing

    def __post_init__(self):
        if self.requirement_files is None:
            self.requirement_files = []

class ContextBuilder:
    KEYWORDS = ["test", "build", "linux", "unittest", "integration", "deploy"]

    def __init__(
        self,
        *,
        workspace_root: str = "execution_agent_workspace",
        search_logs_root: str = "search_logs",
        problems_memory_root: str = "problems_memory",
    ):
        self.workspace_root = workspace_root
        self.search_logs_root = search_logs_root
        self.problems_memory_root = problems_memory_root

    def _shorten_path(self, full_path: str, project_name: str) -> str:
        """
        Shorten an absolute path to just workspace/project/relative_path.

        For example:
            /home/user/mini_execution_agent/execution_agent_workspace/pandas/.github/workflows/test.yml
        becomes:
            execution_agent_workspace/pandas/.github/workflows/test.yml
        """
        # Try to find the workspace_root in the path and extract from there
        workspace_base = os.path.basename(self.workspace_root.rstrip(os.sep))

        # Find where workspace_root appears in the path
        try:
            # Get the absolute workspace root for comparison
            abs_workspace = os.path.abspath(self.workspace_root)
            abs_path = os.path.abspath(full_path)

            if abs_path.startswith(abs_workspace):
                # Extract the relative part after workspace_root
                rel_path = os.path.relpath(abs_path, os.path.dirname(abs_workspace))
                return rel_path
        except (ValueError, OSError):
            pass

        # Fallback: try to find the workspace_base/project_name pattern
        pattern = os.path.join(workspace_base, project_name)
        idx = full_path.find(pattern)
        if idx != -1:
            return full_path[idx:]

        # Last resort: just return the original path
        return full_path

    # ---------- faithful file discovery ----------
    def find_workflows(self, project_name: str, filter_by_keywords: bool = False) -> List[str]:
        """
        Find workflow files in the project's .github/workflows directory.

        Args:
            project_name: Name of the project (subdirectory in workspace_root)
            filter_by_keywords: If True, only return files with keywords in the
                filename. If False (default), return all workflow files and let
                LLM filtering determine relevance later.

        Returns:
            List of paths to workflow YAML files
        """
        found_files: List[str] = []
        workflow_dir = os.path.join(self.workspace_root, project_name, ".github", "workflows")
        if not os.path.isdir(workflow_dir):
            return []

        result = subprocess.run(
            ["find", workflow_dir, "-name", "*.yml", "-o", "-name", "*.yaml"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        if result.returncode != 0:
            return []

        for path in result.stdout.splitlines():
            if not path.strip():
                continue
            if filter_by_keywords:
                filename = os.path.basename(path).lower()
                if any(k in filename for k in self.KEYWORDS):
                    found_files.append(path)
            else:
                found_files.append(path)
        return found_files

    def find_dockerfiles(self, project_path: str) -> List[str]:
        proj_dir = os.path.join(self.workspace_root, project_path)
        result = subprocess.run(
            ["find", proj_dir, "-name", "Dockerfile"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        if result.returncode != 0:
            return []
        return result.stdout.splitlines()

    # ---------- requirement files discovery (language-dependent) ----------

    # Language-specific requirement file patterns
    REQUIREMENT_FILE_PATTERNS: Dict[str, List[str]] = {
        # Python patterns
        "python": [
            "requirements.txt", "requirements*.txt", "requirements/*.txt",
            "setup.py", "setup.cfg", "pyproject.toml",
            "Pipfile", "Pipfile.lock", "poetry.lock",
            "environment.yml", "environment.yaml", "conda.yml", "conda.yaml",
            "tox.ini", ".python-version",
        ],
        # JavaScript/TypeScript patterns
        "javascript": [
            "package.json", "package-lock.json", "yarn.lock", "pnpm-lock.yaml",
            "bun.lockb", ".nvmrc", ".node-version",
        ],
        "typescript": [
            "package.json", "package-lock.json", "yarn.lock", "pnpm-lock.yaml",
            "bun.lockb", ".nvmrc", ".node-version", "tsconfig.json",
        ],
        # Java/JVM patterns
        "java": [
            "pom.xml", "build.gradle", "build.gradle.kts", "settings.gradle",
            "gradle.properties", ".java-version", "gradlew",
        ],
        "kotlin": [
            "pom.xml", "build.gradle", "build.gradle.kts", "settings.gradle",
            "gradle.properties",
        ],
        "scala": [
            "build.sbt", "pom.xml", "build.gradle",
        ],
        # C/C++ patterns
        "c": [
            "CMakeLists.txt", "Makefile", "makefile", "GNUmakefile",
            "configure", "configure.ac", "configure.in",
            "meson.build", "conanfile.txt", "conanfile.py",
            "vcpkg.json", "WORKSPACE", "BUILD", "BUILD.bazel",
        ],
        "cpp": [
            "CMakeLists.txt", "Makefile", "makefile", "GNUmakefile",
            "configure", "configure.ac", "configure.in",
            "meson.build", "conanfile.txt", "conanfile.py",
            "vcpkg.json", "WORKSPACE", "BUILD", "BUILD.bazel",
        ],
        "c++": [
            "CMakeLists.txt", "Makefile", "makefile", "GNUmakefile",
            "configure", "configure.ac", "configure.in",
            "meson.build", "conanfile.txt", "conanfile.py",
            "vcpkg.json", "WORKSPACE", "BUILD", "BUILD.bazel",
        ],
        # Rust patterns
        "rust": [
            "Cargo.toml", "Cargo.lock", "rust-toolchain", "rust-toolchain.toml",
        ],
        # Go patterns
        "go": [
            "go.mod", "go.sum", "Gopkg.toml", "Gopkg.lock", "glide.yaml",
        ],
        # Ruby patterns
        "ruby": [
            "Gemfile", "Gemfile.lock", ".ruby-version", ".ruby-gemset",
            "*.gemspec",
        ],
        # PHP patterns
        "php": [
            "composer.json", "composer.lock",
        ],
        # .NET patterns
        "csharp": [
            "*.csproj", "*.sln", "packages.config", "nuget.config",
            "Directory.Build.props", "global.json",
        ],
        "fsharp": [
            "*.fsproj", "*.sln", "packages.config", "nuget.config",
        ],
        # Elixir patterns
        "elixir": [
            "mix.exs", "mix.lock",
        ],
        # Haskell patterns
        "haskell": [
            "stack.yaml", "cabal.project", "*.cabal", "package.yaml",
        ],
    }

    # Common patterns across all languages (always check these)
    COMMON_REQUIREMENT_PATTERNS: List[str] = [
        "Makefile", "makefile", "GNUmakefile",
        "Dockerfile", "docker-compose.yml", "docker-compose.yaml",
        ".tool-versions",  # asdf version manager
    ]

    def find_requirement_files(self, project_path: str, language: str) -> List[str]:
        """
        Find requirement/dependency files in the repository based on language.

        This uses a heuristic approach:
        1. Look for language-specific requirement files
        2. Also look for common build system files
        3. Search at the root level and one level deep

        Args:
            project_path: Name of the project (subdirectory in workspace_root)
            language: Primary language of the project (e.g., 'python', 'java', 'c')

        Returns:
            List of paths to requirement/dependency files
        """
        found_files: List[str] = []
        proj_dir = os.path.join(self.workspace_root, project_path)

        if not os.path.isdir(proj_dir):
            return []

        # Get language-specific patterns
        lang_lower = language.lower().strip() if language else ""
        patterns = list(self.COMMON_REQUIREMENT_PATTERNS)

        # Add language-specific patterns
        if lang_lower in self.REQUIREMENT_FILE_PATTERNS:
            patterns.extend(self.REQUIREMENT_FILE_PATTERNS[lang_lower])
        else:
            # If the language is unknown, check the patterns of all known languages
            for lang_patterns in self.REQUIREMENT_FILE_PATTERNS.values():
                patterns.extend(lang_patterns)
            patterns = list(set(patterns))  # Remove duplicates

        # Search for files matching patterns
        for pattern in patterns:
            # Handle glob patterns (files with *)
            if "*" in pattern:
                # Use find with -name for glob patterns
                try:
                    result = subprocess.run(
                        ["find", proj_dir, "-maxdepth", "2", "-name", pattern, "-type", "f"],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                        text=True,
                        timeout=30,
                    )
                    if result.returncode == 0:
                        for path in result.stdout.splitlines():
                            path = path.strip()
                            if path and path not in found_files:
                                found_files.append(path)
                except Exception:
                    pass
            else:
                # Check for the exact filename at the root
                root_path = os.path.join(proj_dir, pattern)
                if os.path.isfile(root_path) and root_path not in found_files:
                    found_files.append(root_path)

                # Check one level deep (common subdirs); the root was already checked above
                for subdir in ["src", "lib", "pkg", "config", "build"]:
                    check_path = os.path.join(proj_dir, subdir, pattern)
                    if os.path.isfile(check_path) and check_path not in found_files:
                        found_files.append(check_path)

        return found_files

    def find_readme(self, project_path: str) -> Optional[str]:
        """
        Find the main README file in the repository.

        Looks for common README file names at the root level.

        Args:
            project_path: Name of the project (subdirectory in workspace_root)

        Returns:
            Path to the README file if found, None otherwise
        """
        proj_dir = os.path.join(self.workspace_root, project_path)

        if not os.path.isdir(proj_dir):
            return None

        # Common README file names (in priority order)
        readme_names = [
            "README.md", "README.rst", "README.txt", "README",
            "readme.md", "readme.rst", "readme.txt", "readme",
            "Readme.md", "Readme.rst", "Readme.txt", "Readme",
            "INSTALL.md", "INSTALL.txt", "INSTALL",
            "CONTRIBUTING.md", "BUILDING.md",
        ]

        for name in readme_names:
            path = os.path.join(proj_dir, name)
            if os.path.isfile(path):
                return path

        return None

    def load_requirement_files(
        self,
        requirement_paths: List[str],
        project_name: str,
        max_file_size: int = 50_000,
    ) -> List[Tuple[str, str]]:
        """
        Load requirement/dependency file contents.

        Args:
            requirement_paths: List of paths to requirement files
            project_name: Name of the project (for shortening paths in output)
            max_file_size: Maximum file size to read (bytes)

        Returns:
            List of (shortened_path, content) tuples
        """
        out = []
        for p in requirement_paths:
            # Skip very large files (like lock files)
            try:
                file_size = os.path.getsize(p)
                if file_size > max_file_size:
                    _LOG.debug(f"Skipping large file {p} ({file_size} bytes)")
                    # Still include a note about the file
                    short_path = self._shorten_path(p, project_name)
                    out.append((short_path, f"[File too large: {file_size} bytes. This is likely a lock file.]"))
                    continue
            except Exception:
                pass

            short_path = self._shorten_path(p, project_name)
            content = self._read_text_file(p, max_chars=max_file_size)
            if content:
                out.append((short_path, content))

        return out

    def load_readme_content(
        self,
        readme_path: Optional[str],
        project_name: str,
        max_chars: int = 30_000,
    ) -> Optional[str]:
        """
        Load and extract the relevant sections from a README file.

        Focuses on sections related to:
        - Installation
        - Building from source
        - Running tests
        - Dependencies/Requirements

        Args:
            readme_path: Path to the README file
            project_name: Name of the project (for logging)
            max_chars: Maximum characters to read

        Returns:
            Extracted README content or None if not found
        """
        if not readme_path or not os.path.isfile(readme_path):
            return None

        content = self._read_text_file(readme_path, max_chars=max_chars * 2)  # Read more, then filter
        if not content:
            return None

        # Keywords that indicate relevant sections
        relevant_keywords = [
            "install", "setup", "getting started", "build", "compile",
            "test", "testing", "development", "contributing", "requirements",
            "dependencies", "prerequisite", "quick start", "usage",
            "from source", "docker", "container", "environment",
        ]

        # Try to extract relevant sections
        lines = content.split('\n')
        relevant_sections = []
        in_relevant_section = False
        current_section = []

        for i, line in enumerate(lines):
            # Check if this is a header (markdown '#' or a setext underline)
            is_header = False

            if line.startswith('#'):
                is_header = True
            elif i + 1 < len(lines):
                next_line = lines[i + 1]
                if next_line.strip() and set(next_line.strip()) in ({'='}, {'-'}):
                    is_header = True

            if is_header:
                # Save the previous section if it was relevant
                if in_relevant_section and current_section:
                    relevant_sections.append('\n'.join(current_section))

                # Check if the new section is relevant
                line_lower = line.lower()
                in_relevant_section = any(kw in line_lower for kw in relevant_keywords)
                current_section = [line] if in_relevant_section else []

            elif in_relevant_section:
                current_section.append(line)

        # Don't forget the last section
        if in_relevant_section and current_section:
            relevant_sections.append('\n'.join(current_section))

        if relevant_sections:
            extracted = '\n\n'.join(relevant_sections)
            if len(extracted) > max_chars:
                extracted = extracted[:max_chars] + "\n... [README content truncated]"
            return extracted

        # If no relevant sections were found, return the first part of the README
        if len(content) > max_chars:
            content = content[:max_chars] + "\n... [README content truncated]"
        return content

    # ---------- content loaders ----------
    def _read_text_file(self, path: str, max_chars: int = 200_000) -> str:
        try:
            with open(path, "r", errors="ignore") as f:
                data = f.read()
            return data[:max_chars]
        except Exception:
            return ""

    def load_workflow_contents(
        self,
        workflow_paths: List[str],
        project_name: str,
        filter_relevant: bool = True,
        model=None,
        use_llm_filter: bool = True,
    ) -> List[Tuple[str, str]]:
        """
        Load workflow file contents.

        Args:
            workflow_paths: List of paths to workflow YAML files
            project_name: Name of the project (for shortening paths in output)
            filter_relevant: If True, extract only the relevant parts (build/test
                commands) and remove CI/CD platform-specific configuration
            model: Optional model instance for LLM-based filtering
            use_llm_filter: If True and a model is provided, use the LLM to keep
                only relevant CI/CD files (those related to installing from
                source and running tests on Linux)

        Returns:
            List of (shortened_path, content) tuples (only relevant files if LLM
            filtering is enabled)
        """
        out = []
        for p in workflow_paths:
            raw_content = self._read_text_file(p)
            if not raw_content:
                continue

            # Shorten the path for output
            short_path = self._shorten_path(p, project_name)

            # Try LLM filtering first if enabled and a model is available
            if use_llm_filter and model is not None:
                llm_filtered = _llm_filter_cicd_file(short_path, raw_content, model)
                if llm_filtered:
                    out.append((short_path, llm_filtered))
                # If the LLM returned None, the file was judged not relevant
                # (or filtering failed) - skip it
                continue

            # Fall back to heuristic filtering if the LLM is not available
            if filter_relevant:
                filtered = _extract_relevant_workflow_parts(raw_content)
                # Only include the file if we extracted something meaningful
                if filtered and filtered.strip():
                    out.append((short_path, filtered))
                else:
                    # Fall back to the raw content if filtering removed everything
                    out.append((short_path, raw_content))
            else:
                out.append((short_path, raw_content))
        return out

    def load_dockerfile_contents(self, dockerfile_paths: List[str], project_name: str) -> List[Tuple[str, str]]:
        """
        Load Dockerfile contents.

        Args:
            dockerfile_paths: List of paths to Dockerfiles
            project_name: Name of the project (for shortening paths in output)

        Returns:
            List of (shortened_path, content) tuples
        """
        out = []
        for p in dockerfile_paths:
            short_path = self._shorten_path(p, project_name)
            out.append((short_path, self._read_text_file(p)))
        return out

    def load_problems_memory(self, project_path: str) -> Optional[str]:
        # Replicates the old "problems_memory/<project_path>" behavior
        p = os.path.join(self.problems_memory_root, project_path)
        if not os.path.exists(p):
            return None
        txt = self._read_text_file(p, max_chars=80_000)
        return txt if txt.strip() else None

    # ---------- repository cloning ----------
    def clone_repo(self, project_path: str, project_url: str) -> bool:
        """
        Clone the repository into workspace_root/project_path if not already present.
        Returns True if the repo is available (either already existed or was cloned
        successfully).
        """
        target_dir = os.path.join(self.workspace_root, project_path)

        # Check if already cloned (has a .git directory)
        if os.path.isdir(os.path.join(target_dir, ".git")):
            _LOG.info(f"Repository already exists at {target_dir}")
            return True

        # Create the parent directory if needed
        os.makedirs(self.workspace_root, exist_ok=True)

        # Clone the repository
        _LOG.info(f"Cloning repository {project_url} to {target_dir}...")
        try:
            result = subprocess.run(
                ["git", "clone", "--depth", "1", project_url, target_dir],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=300,  # 5 minute timeout for large repos
            )
            if result.returncode == 0:
                _LOG.info(f"Successfully cloned {project_url} to {target_dir}")
                return True
            else:
                _LOG.warning(f"Failed to clone {project_url}: {result.stderr}")
                return False
        except subprocess.TimeoutExpired:
            _LOG.warning(f"Cloning {project_url} timed out after 5 minutes")
            return False
        except Exception as e:
            _LOG.warning(f"Failed to clone {project_url}: {e}")
            return False

    # ---------- web search cache loader ----------
    def load_cached_search_results(self, project_id: str) -> List[Dict[str, Any]]:
        """
        The old code reads:
            search_logs/<project_id>/<project_id>_build_install_from_source.json
        while the extra search-doc code saves:
            search_logs/<project_id>/<query>.json
        Both conventions are supported here.
        """
        folder = os.path.join(self.search_logs_root, project_id)
        if not os.path.isdir(folder):
            return []

        preferred = os.path.join(folder, f"{project_id}_build_install_from_source.json")
        if os.path.exists(preferred):
            try:
                with open(preferred, "r") as f:
                    return json.loads(f.read())
            except Exception:
                return []

        # Fallback: load any JSON file in the folder (best-effort)
        results: List[Dict[str, Any]] = []
        for fn in sorted(os.listdir(folder)):
            if not fn.endswith(".json"):
                continue
            try:
                with open(os.path.join(folder, fn), "r") as f:
                    results.extend(json.loads(f.read()))
            except Exception:
                continue
        return results

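    # Example cache layouts accepted above, for a hypothetical project id
    # "example-project" (the first form is preferred; any other *.json files
    # in the folder are merged as a fallback):
    #
    #   search_logs/example-project/example-project_build_install_from_source.json
    #   search_logs/example-project/how_to_install.json
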
    # ---------- web search functionality ----------
    def _duckduckgo_search(self, query: str, max_results: int = 10) -> List[Dict[str, str]]:
        """
        Perform a DuckDuckGo search and return a list of {url, title, snippet} dicts.

        Reliability improvements:
        - requests.Session() + more complete headers (cookies / consistency)
        - Prefer POST to the html endpoint (often more stable for scraping)
        - Fall back to the GET html endpoint, then the lite endpoint
        - More tolerant parsing (BeautifulSoup if installed; regex fallback otherwise)
        - Detect likely bot-check / interstitial HTML and warn clearly
        - Better redirect URL cleanup (uddg=) + HTML entity unescape
        """
        if requests is None:
            _LOG.warning("requests library not available, skipping web search")
            return []

        import html as htmllib
        from urllib.parse import unquote, parse_qs, urlparse

        def _extract_actual_url(href: str) -> str:
            """Best-effort cleanup of DDG redirect URLs and normalization."""
            if not href:
                return href

            # Some links may be protocol-relative
            if href.startswith("//"):
                href = "https:" + href

            # DDG often wraps URLs in a redirect containing uddg=
            if "uddg=" in href:
                try:
                    parsed = urlparse(href)
                    actual_url = parse_qs(parsed.query).get("uddg", [href])[0]
                    return unquote(actual_url)
                except Exception:
                    return href

            return href

        def _looks_like_block_or_interstitial(page_text: str) -> bool:
            """Heuristics for bot-check / interstitial pages that return 200 but no results."""
            t = (page_text or "").lower()
            # Keep this list broad; the goal is to detect, log, and bail quickly.
            indicators = [
                "captcha",
                "unusual traffic",
                "automated requests",
                "sorry",
                "verify you are a human",
                "enable javascript",
                "temporarily unavailable",
            ]
            return any(s in t for s in indicators)

        def _parse_results_bs4(page_text: str) -> List[Dict[str, str]]:
            """Parse results using BeautifulSoup. Raises ImportError if bs4 is not installed."""
            from bs4 import BeautifulSoup  # type: ignore

            soup = BeautifulSoup(page_text, "html.parser")
            results: List[Dict[str, str]] = []

            # Primary selector for html.duckduckgo.com results
            anchors = soup.select("a.result__a")
            if anchors:
                for a in anchors:
                    href = _extract_actual_url(a.get("href", "").strip())
                    title = a.get_text(" ", strip=True)

                    # Try to find a nearby snippet inside the same result container
                    snippet = ""
                    container = a.find_parent(class_=re.compile(r"\bresult\b"))
                    if container is not None:
                        snip_el = container.select_one(".result__snippet")
                        if snip_el is None:
                            # Sometimes snippet class variants appear
                            snip_el = container.select_one("[class*='result__snippet']")
                        if snip_el is not None:
                            snippet = snip_el.get_text(" ", strip=True)

                    if href and title:
                        results.append({"url": href, "title": title, "snippet": snippet})

                    if len(results) >= max_results:
                        break

            # Fallback parsing for the lite endpoint (structure differs)
            if not results:
                # Lite pages often have result links without result__a
                lite_candidates = soup.select("a.result-link")
                if not lite_candidates:
                    # Heuristic fallback: keep only outbound links; exclude obvious DDG navigation
                    for a in soup.find_all("a", href=True):
                        h = a.get("href", "")
                        if not h:
                            continue
                        if "duckduckgo.com" in h and "uddg=" not in h:
                            continue
                        if h.startswith("/"):
                            continue
                        text = a.get_text(" ", strip=True)
                        if not text:
                            continue
                        lite_candidates.append(a)

                for a in lite_candidates:
                    href = _extract_actual_url(a.get("href", "").strip())
                    title = a.get_text(" ", strip=True)

                    if href and title:
                        results.append({"url": href, "title": title, "snippet": ""})

                    if len(results) >= max_results:
                        break

            return results

        def _parse_results_regex(page_text: str) -> List[Dict[str, str]]:
            """Regex-based parser with looser matching (still more brittle than bs4)."""
            results: List[Dict[str, str]] = []

            # Match anchors whose class contains result__a (not an exact match); allow nested tags in the title.
            link_pattern = re.compile(
                r'<a[^>]*class="[^"]*\bresult__a\b[^"]*"[^>]*href="([^"]+)"[^>]*>(.*?)</a>',
                re.IGNORECASE | re.DOTALL,
            )

            # Snippets can be <a>, <div>, or <span>, and the class may include additional tokens.
            snippet_pattern = re.compile(
                r'<(?:a|div|span)[^>]*class="[^"]*\bresult__snippet\b[^"]*"[^>]*>(.*?)</(?:a|div|span)>',
                re.IGNORECASE | re.DOTALL,
            )

            links = link_pattern.findall(page_text)
            snippets = snippet_pattern.findall(page_text)

            def _strip_tags(s: str) -> str:
                s = re.sub(r"<[^>]+>", "", s)
                return htmllib.unescape(s).strip()

            for i, (href, title_html) in enumerate(links[:max_results]):
                href = _extract_actual_url(htmllib.unescape(href))
                title = _strip_tags(title_html)
                snippet = _strip_tags(snippets[i]) if i < len(snippets) else ""

                if href and title:
                    results.append({"url": href, "title": title, "snippet": snippet})

            # Lite fallback: pull any anchors that look like outbound results
            if not results:
                generic_anchor = re.compile(
                    r'<a[^>]*href="([^"]+)"[^>]*>(.*?)</a>',
                    re.IGNORECASE | re.DOTALL,
                )
                for href, text_html in generic_anchor.findall(page_text):
                    href = htmllib.unescape(href)
                    if "duckduckgo.com" in href and "uddg=" not in href:
                        continue
                    if href.startswith("/"):
                        continue
                    title = _strip_tags(text_html)
                    href = _extract_actual_url(href)
                    if href and title:
                        results.append({"url": href, "title": title, "snippet": ""})
                    if len(results) >= max_results:
                        break

            return results[:max_results]

        # Use a session for cookies + consistent behavior
        session = requests.Session()
        session.headers.update(
            {
                "User-Agent": (
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                ),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
                "Cache-Control": "no-cache",
                "Pragma": "no-cache",
                "DNT": "1",
                "Referer": "https://duckduckgo.com/",
            }
        )

        # Try a few approaches: POST html, GET html, GET lite
        attempts = [
            ("POST", "https://html.duckduckgo.com/html/", {"q": query}),
            ("GET", f"https://html.duckduckgo.com/html/?q={quote_plus(query)}", None),
            ("GET", f"https://lite.duckduckgo.com/lite/?q={quote_plus(query)}", None),
        ]

        last_error: Optional[Exception] = None

        for method, url, data in attempts:
            try:
                # Small backoff between endpoint fallbacks (helps if rate-limited)
                time.sleep(0.25)

                if method == "POST":
                    resp = session.post(url, data=data, timeout=30)
                else:
                    resp = session.get(url, timeout=30)

                # If rate-limited, try the next endpoint
                if resp.status_code in (429, 503):
                    _LOG.warning("DuckDuckGo responded with %s for %s; trying fallback", resp.status_code, url)
                    continue

                resp.raise_for_status()
                page_text = resp.text or ""

                # Quick "blocked/interstitial" detection
                if _looks_like_block_or_interstitial(page_text):
                    _LOG.warning(
                        "DuckDuckGo returned a likely interstitial/bot-check page (status=%s, url=%s). "
                        "This commonly yields 0 parsed results.",
                        resp.status_code,
                        resp.url,
                    )
                    # Try the next endpoint: lite sometimes works when html is blocked (and vice versa)
                    continue

                # Parse results (prefer bs4 if installed)
                try:
                    results = _parse_results_bs4(page_text)
                except ImportError:
                    results = _parse_results_regex(page_text)

                # Final sanity check: if results are empty but the page contains result markers, log for visibility
                if not results and "result__a" in page_text:
                    _LOG.warning(
                        "DDG page appears to contain result markers but the parser returned 0 results. "
                        "The HTML structure may have changed."
                    )

                if results:
                    _LOG.info("DuckDuckGo search for %r returned %d results", query, len(results))
                    return results[:max_results]

            except Exception as e:
                last_error = e
                _LOG.warning("DuckDuckGo search attempt failed (%s %s): %s", method, url, e)
                continue

        if last_error:
            _LOG.warning("DuckDuckGo search failed after fallbacks: %s", last_error)

        return []

    def _fetch_and_extract_page(self, url: str, max_chars: int = 15000) -> str:
        """
        Fetch a web page and extract its main text content.
        Returns an empty string on failure.
        """
        if requests is None:
            return ""

        try:
            headers = {
                "User-Agent": (
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                )
            }
            resp = requests.get(url, headers=headers, timeout=20)
            resp.raise_for_status()

            page = resp.text

            # Simple HTML-to-text extraction
            # Remove script and style elements
            page = re.sub(r'<script[^>]*>.*?</script>', '', page, flags=re.DOTALL | re.IGNORECASE)
            page = re.sub(r'<style[^>]*>.*?</style>', '', page, flags=re.DOTALL | re.IGNORECASE)
            # Remove HTML tags
            text = re.sub(r'<[^>]+>', ' ', page)
            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text).strip()
            # Decode HTML entities
            try:
                import html as html_module
                text = html_module.unescape(text)
            except Exception:
                pass

            return text[:max_chars]

        except Exception as e:
            _LOG.debug(f"Failed to fetch {url}: {e}")
            return ""

    def perform_web_search(
        self,
        project_name: str,
        *,
        model=None,
        knowledge_model=None,
        max_results: int = 5,
    ) -> List[Dict[str, Any]]:
        """
        Perform a web search for 'how to install <project_name> on linux and run tests'.
        Fetches the top results, extracts their content, and optionally uses an LLM
        to analyze/summarize each page.

        Args:
            project_name: Name of the project to search for
            model: Default model (used if knowledge_model is not provided)
            knowledge_model: Separate model for web content analysis (e.g., gpt-5-mini).
                This model should be up-to-date and knowledgeable about current
                technologies, build systems, and best practices.
            max_results: Maximum number of search results to fetch

        Returns:
            List of {url, title, snippet, content, analysis} dicts.
        """
        # Use knowledge_model for analysis if provided, otherwise fall back to model
        analysis_model = knowledge_model if knowledge_model is not None else model
        query = f"how to install {project_name} on linux and run tests"
        _LOG.info(f"Performing web search: '{query}'")

        search_results = self._duckduckgo_search(query, max_results=max_results)
        if not search_results:
            _LOG.warning(f"No search results found for '{query}'")
            return []

        enriched_results: List[Dict[str, Any]] = []

        for result in search_results:
            url = result.get("url", "")
            title = result.get("title", "")
            snippet = result.get("snippet", "")

            # Fetch the page content
            content = self._fetch_and_extract_page(url)

            # Build the result entry
            entry: Dict[str, Any] = {
                "url": url,
                "title": title,
                "snippet": snippet,
                "content": content,
                "analysis": "",
            }

            # If an analysis model is available, use it to analyze/summarize the page content
            if analysis_model is not None and content:
                try:
                    analysis_prompt = f"""You are helping with a specific task: INSTALLING THE PROJECT '{project_name}' FROM SOURCE CODE AND RUNNING ITS TEST SUITE inside a fresh Linux (Ubuntu) Docker container.

Analyze this web page and extract ONLY information that helps accomplish this goal.

Web page URL: {url}
Web page content:
{content[:10000]}

WHAT WE NEED TO KNOW (extract if present):
1. SYSTEM DEPENDENCIES: What system packages need to be installed (apt-get install ...)?
   - Include specific package names, not just general descriptions

2. LANGUAGE/RUNTIME REQUIREMENTS: What version of Python/Node/Java/etc. is required?
   - Specific version numbers are very helpful

3. PROJECT DEPENDENCIES: How to install the project's dependencies?
   - Exact commands (pip install -e ., npm install, cargo build, etc.)
   - Any special flags or environment variables needed

4. BUILD COMMANDS: How to build/compile the project from source?
   - Exact commands in order
   - Any configuration steps needed first

5. TEST COMMANDS: How to run the test suite? (THIS IS CRUCIAL)
   - The exact command to run tests (pytest, npm test, make test, etc.)
   - Any test-specific setup required
   - How to run a subset of tests if the full suite takes too long

6. COMMON ISSUES: Any known problems when building/testing and their solutions?
   - Missing dependencies that aren't documented
   - Platform-specific issues on Linux
   - Version conflicts and how to resolve them

7. DOCKER/CONTAINER HINTS: Any Dockerfile examples or container setup instructions?

IMPORTANT:
- Focus ONLY on building from source and running tests - ignore deployment, production setup, or usage documentation
- Preserve exact commands as written - don't paraphrase shell commands
- Note any prerequisites or assumptions the documentation makes
- If information seems incomplete or might not work in a fresh container, mention what might be missing

Provide a concise, actionable summary focused on our goal of installing from source and running tests."""

                    resp = analysis_model.query([{"role": "user", "content": analysis_prompt}])
                    entry["analysis"] = resp.get("content", "").strip()
                except Exception as e:
                    _LOG.warning(f"Failed to analyze page {url}: {e}")

            enriched_results.append(entry)
            time.sleep(0.5)  # Be polite to servers

        return enriched_results

    def save_search_results(self, project_id: str, results: List[Dict[str, Any]]) -> None:
        """
        Save search results to the cache for future use.
        """
        folder = os.path.join(self.search_logs_root, project_id)
        os.makedirs(folder, exist_ok=True)

        cache_file = os.path.join(folder, f"{project_id}_build_install_from_source.json")
        try:
            with open(cache_file, "w") as f:
                json.dump(results, f, indent=2)
            _LOG.info(f"Saved {len(results)} search results to {cache_file}")
        except Exception as e:
            _LOG.warning(f"Failed to save search results: {e}")

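    # Shape of one entry in the saved/loaded results list, as produced by
    # perform_web_search() above (the values here are illustrative):
    #
    #   {
    #     "url": "https://example.org/install-guide",
    #     "title": "Installing from source",
    #     "snippet": "...",
    #     "content": "...extracted page text...",
    #     "analysis": "...LLM summary; empty string if no analysis model was given..."
    #   }
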
    # ---------- unified summary generator ----------
    def build_unified_summary(
        self,
        *,
        model,
        knowledge_model=None,
        search_workflows_summary_prompt: str,
        project_name: str,
        language: str,
        search_results: List[Dict[str, Any]],
        dockerfile_contents: List[Tuple[str, str]],
        requirement_files: List[Tuple[str, str]],
        readme_content: Optional[str] = None,
        workflow_contents: Optional[List[Tuple[str, str]]] = None,
        cache_path: Optional[str] = None,
    ) -> Optional[str]:
        """
        Ask the LLM to consolidate all available information into a structured
        "prompt section".

        Args:
            model: Default model (used if knowledge_model is not provided)
            knowledge_model: Separate model for summary generation (e.g., gpt-5-mini).
                This model should be up-to-date and knowledgeable about current
                technologies, build systems, and best practices.
            search_workflows_summary_prompt: Prompt template for summary generation
            project_name: Name of the project
            language: Primary programming language of the project
            search_results: Web search results with analysis
            dockerfile_contents: List of (path, content) tuples for Dockerfiles
            requirement_files: List of (path, content) tuples for requirement/dependency files
            readme_content: Extracted README content (installation/build instructions)
            workflow_contents: List of (path, content) tuples for CI/CD workflow files
            cache_path: Optional path to cache the summary
        """
        # Use knowledge_model for the summary if provided, otherwise fall back to model
        summary_model = knowledge_model if knowledge_model is not None else model

        if cache_path and os.path.exists(cache_path):
            try:
                with open(cache_path, "r") as f:
                    txt = f.read()
                return txt if txt.strip() else None
            except Exception:
                pass

        # Assemble evidence from all sources
        pages = []
        for r in search_results[:10]:
            url = r.get("url", "")
            analysis = r.get("analysis", "")
            if not analysis:
                continue
            pages.append({"url": url, "content": analysis[:12000]})

        docker_bits = []
        for path, content in dockerfile_contents[:5]:
            if content.strip():
                docker_bits.append({"path": path, "content": content[:8000]})

        # Add requirement files (limit size to prevent token overflow)
        req_files = []
        for path, content in (requirement_files or [])[:10]:
            if content.strip():
                req_files.append({"path": path, "content": content[:15000]})

        # Add workflow files (CI/CD)
        workflow_bits = []
        for path, content in (workflow_contents or [])[:5]:
            if content.strip():
                workflow_bits.append({"path": path, "content": content[:10000]})

        # Check if we have any meaningful content
        has_content = pages or docker_bits or req_files or workflow_bits or readme_content
        if not has_content:
            return None

        query = search_workflows_summary_prompt.format(project_name)

        payload = {
            "project": project_name,
            "language": language,
            "web_pages": pages,
            "dockerfiles": docker_bits,
            "requirement_files": req_files,
            "workflow_files": workflow_bits,
            "readme_content": (readme_content[:20000] if readme_content else ""),
        }

        resp = summary_model.query([
            {"role": "user", "content": query + "\n\nEvidence (JSON):\n" + json.dumps(payload, indent=2)}
        ])
        summary = resp.get("content", "").strip()
        if cache_path:
            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
            try:
                with open(cache_path, "w") as f:
                    f.write(summary)
            except Exception:
                pass
        return summary if summary else None

    # ---------- main entry ----------
    def build_repo_context(
        self,
        *,
        model,
        knowledge_model=None,
        project_path: str,
        project_url: str,
        language: str,
        search_workflows_summary_prompt: str,
        unified_summary_cache_root: str = "search_logs_unified",
        perform_web_search_if_missing: bool = True,
    ) -> RepoContext:
        """
        Build the repository context, including workflows, Dockerfiles, requirement
        files, README, and web search results.

        Args:
            model: Main model used for CI/CD file filtering
            knowledge_model: Separate model for web search analysis and unified
                summary generation (e.g., gpt-5-mini). This model should be
                up-to-date and knowledgeable about current technologies and best
                practices. If not provided, falls back to the main model.
            project_path: Path/name of the project
            project_url: Git URL of the project
            language: Programming language of the project
            search_workflows_summary_prompt: Prompt template for summary generation
            unified_summary_cache_root: Root directory for caching unified summaries
            perform_web_search_if_missing: Whether to perform a web search if there
                are no cached results
        """
        # Clone the repository first so the agent can explore it before creating a container
        local_repo_available = self.clone_repo(project_path, project_url)

        # Find and load workflow files (CI/CD)
        workflows = self.find_workflows(project_path, filter_by_keywords=False)
        workflow_contents = self.load_workflow_contents(
            workflows, project_name=project_path, model=model, use_llm_filter=True
        )

        # Find and load Dockerfiles
        dockerfiles = self.find_dockerfiles(project_path)
        dockerfile_contents = self.load_dockerfile_contents(dockerfiles, project_name=project_path)

        # Find and load requirement/dependency files (language-specific heuristic)
        _LOG.info(f"Searching for requirement files for language: {language}")
        requirement_paths = self.find_requirement_files(project_path, language)
        requirement_files = self.load_requirement_files(requirement_paths, project_name=project_path)
        _LOG.info(f"Found {len(requirement_files)} requirement/dependency files")

        # Find and load README content
        readme_path = self.find_readme(project_path)
        readme_content = self.load_readme_content(readme_path, project_name=project_path)
        if readme_content:
            _LOG.info(f"Loaded README content ({len(readme_content)} chars)")
        else:
            _LOG.info("No README file found or content extracted")

        # Try to load cached search results first
        search_results = self.load_cached_search_results(project_path)

        # If there are no cached results and web search is enabled, perform a web search.
        # Use knowledge_model for web content analysis (more up-to-date knowledge).
        if not search_results and perform_web_search_if_missing:
            _LOG.info(f"No cached search results for '{project_path}', performing web search...")
            search_results = self.perform_web_search(
                project_name=project_path,
                model=model,
                knowledge_model=knowledge_model,
                max_results=5,
            )
            # Save the results for future use
            if search_results:
                self.save_search_results(project_path, search_results)

        problems_memory = self.load_problems_memory(project_path)

        # Use knowledge_model for the unified summary (more up-to-date knowledge).
        # Pass all collected information to create a comprehensive summary.
        cache_path = os.path.join(unified_summary_cache_root, project_path, "unified_summary.txt")
        unified_summary = self.build_unified_summary(
            model=model,
            knowledge_model=knowledge_model,
            search_workflows_summary_prompt=search_workflows_summary_prompt,
            project_name=project_path,
            language=language,
            search_results=search_results,
            dockerfile_contents=dockerfile_contents,
            requirement_files=requirement_files,
            readme_content=readme_content,
            workflow_contents=workflow_contents,
            cache_path=cache_path,
        )

        return RepoContext(
            project_path=project_path,
            project_url=project_url,
            language=language,
            workflows=workflows,
            workflow_contents=workflow_contents,
            dockerfiles=dockerfiles,
            dockerfile_contents=dockerfile_contents,
            search_results=search_results,
            requirement_files=requirement_files,
            readme_content=readme_content,
            unified_summary=unified_summary,
            problems_memory=problems_memory,
            local_repo_available=local_repo_available,
        )
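
# Minimal usage sketch (illustrative only, not part of the published package).
# `StubModel` is a hypothetical stand-in for the real model objects used by the
# agent; the only interface this module relies on is a `.query(messages)` method
# returning a dict with a "content" key. The project name/URL below are made up.
if __name__ == "__main__":
    class StubModel:
        def query(self, messages):
            # A real model would return an LLM completion here.
            return {"content": "NOT_RELEVANT"}

    builder = ContextBuilder(workspace_root="execution_agent_workspace")
    ctx = builder.build_repo_context(
        model=StubModel(),
        project_path="example-project",                    # hypothetical
        project_url="https://github.com/example/example",  # hypothetical
        language="python",
        search_workflows_summary_prompt="Summarize build/test info for {}.",
        perform_web_search_if_missing=False,
    )
    print(ctx.local_repo_available, len(ctx.workflow_contents))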