monoco-toolkit 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. monoco/__main__.py +8 -0
  2. monoco/core/artifacts/__init__.py +16 -0
  3. monoco/core/artifacts/manager.py +575 -0
  4. monoco/core/artifacts/models.py +161 -0
  5. monoco/core/config.py +38 -4
  6. monoco/core/git.py +23 -0
  7. monoco/core/hooks/builtin/git_cleanup.py +1 -1
  8. monoco/core/ingestion/__init__.py +20 -0
  9. monoco/core/ingestion/discovery.py +248 -0
  10. monoco/core/ingestion/watcher.py +343 -0
  11. monoco/core/ingestion/worker.py +436 -0
  12. monoco/core/injection.py +63 -29
  13. monoco/core/integrations.py +2 -2
  14. monoco/core/loader.py +633 -0
  15. monoco/core/output.py +5 -5
  16. monoco/core/registry.py +34 -19
  17. monoco/core/resource/__init__.py +5 -0
  18. monoco/core/resource/finder.py +98 -0
  19. monoco/core/resource/manager.py +91 -0
  20. monoco/core/resource/models.py +35 -0
  21. monoco/core/skill_framework.py +292 -0
  22. monoco/core/skills.py +524 -385
  23. monoco/core/sync.py +73 -1
  24. monoco/core/workflow_converter.py +420 -0
  25. monoco/daemon/app.py +77 -1
  26. monoco/daemon/commands.py +10 -0
  27. monoco/daemon/mailroom_service.py +196 -0
  28. monoco/daemon/models.py +1 -0
  29. monoco/daemon/scheduler.py +236 -0
  30. monoco/daemon/services.py +185 -0
  31. monoco/daemon/triggers.py +55 -0
  32. monoco/features/agent/__init__.py +2 -2
  33. monoco/features/agent/adapter.py +41 -0
  34. monoco/features/agent/apoptosis.py +44 -0
  35. monoco/features/agent/cli.py +101 -144
  36. monoco/features/agent/config.py +35 -21
  37. monoco/features/agent/defaults.py +6 -49
  38. monoco/features/agent/engines.py +32 -6
  39. monoco/features/agent/manager.py +47 -6
  40. monoco/features/agent/models.py +2 -2
  41. monoco/features/agent/resources/atoms/atom-code-dev.yaml +61 -0
  42. monoco/features/agent/resources/atoms/atom-issue-lifecycle.yaml +73 -0
  43. monoco/features/agent/resources/atoms/atom-knowledge.yaml +55 -0
  44. monoco/features/agent/resources/atoms/atom-review.yaml +60 -0
  45. monoco/{core/resources/en → features/agent/resources/en/skills/monoco_atom_core}/SKILL.md +3 -1
  46. monoco/features/agent/resources/en/skills/monoco_workflow_agent_engineer/SKILL.md +94 -0
  47. monoco/features/agent/resources/en/skills/monoco_workflow_agent_manager/SKILL.md +93 -0
  48. monoco/features/agent/resources/en/skills/monoco_workflow_agent_planner/SKILL.md +85 -0
  49. monoco/features/agent/resources/en/skills/monoco_workflow_agent_reviewer/SKILL.md +114 -0
  50. monoco/features/agent/resources/workflows/workflow-dev.yaml +83 -0
  51. monoco/features/agent/resources/workflows/workflow-issue-create.yaml +72 -0
  52. monoco/features/agent/resources/workflows/workflow-review.yaml +94 -0
  53. monoco/features/agent/resources/zh/roles/monoco_role_engineer.yaml +49 -0
  54. monoco/features/agent/resources/zh/roles/monoco_role_manager.yaml +46 -0
  55. monoco/features/agent/resources/zh/roles/monoco_role_planner.yaml +46 -0
  56. monoco/features/agent/resources/zh/roles/monoco_role_reviewer.yaml +47 -0
  57. monoco/{core/resources/zh → features/agent/resources/zh/skills/monoco_atom_core}/SKILL.md +3 -1
  58. monoco/features/agent/resources/{skills/flow_engineer → zh/skills/monoco_workflow_agent_engineer}/SKILL.md +2 -2
  59. monoco/features/agent/resources/{skills/flow_manager → zh/skills/monoco_workflow_agent_manager}/SKILL.md +2 -2
  60. monoco/features/agent/resources/zh/skills/monoco_workflow_agent_planner/SKILL.md +259 -0
  61. monoco/features/agent/resources/zh/skills/monoco_workflow_agent_reviewer/SKILL.md +137 -0
  62. monoco/features/agent/session.py +59 -11
  63. monoco/features/agent/worker.py +38 -2
  64. monoco/features/artifact/__init__.py +0 -0
  65. monoco/features/artifact/adapter.py +33 -0
  66. monoco/features/artifact/resources/zh/AGENTS.md +14 -0
  67. monoco/features/artifact/resources/zh/skills/monoco_atom_artifact/SKILL.md +278 -0
  68. monoco/features/glossary/__init__.py +0 -0
  69. monoco/features/glossary/adapter.py +42 -0
  70. monoco/features/glossary/config.py +5 -0
  71. monoco/features/glossary/resources/en/AGENTS.md +29 -0
  72. monoco/features/glossary/resources/en/skills/monoco_atom_glossary/SKILL.md +35 -0
  73. monoco/features/glossary/resources/zh/AGENTS.md +29 -0
  74. monoco/features/glossary/resources/zh/skills/monoco_atom_glossary/SKILL.md +35 -0
  75. monoco/features/hooks/__init__.py +11 -0
  76. monoco/features/hooks/adapter.py +67 -0
  77. monoco/features/hooks/commands.py +309 -0
  78. monoco/features/hooks/core.py +441 -0
  79. monoco/features/hooks/resources/ADDING_HOOKS.md +234 -0
  80. monoco/features/i18n/adapter.py +18 -5
  81. monoco/features/i18n/core.py +482 -17
  82. monoco/features/i18n/resources/en/{SKILL.md → skills/monoco_atom_i18n/SKILL.md} +3 -1
  83. monoco/features/i18n/resources/en/skills/monoco_workflow_i18n_scan/SKILL.md +105 -0
  84. monoco/features/i18n/resources/zh/{SKILL.md → skills/monoco_atom_i18n/SKILL.md} +3 -1
  85. monoco/features/i18n/resources/{skills/i18n_scan_workflow → zh/skills/monoco_workflow_i18n_scan}/SKILL.md +2 -2
  86. monoco/features/issue/adapter.py +19 -6
  87. monoco/features/issue/commands.py +281 -7
  88. monoco/features/issue/core.py +272 -19
  89. monoco/features/issue/engine/machine.py +118 -5
  90. monoco/features/issue/linter.py +60 -5
  91. monoco/features/issue/models.py +3 -2
  92. monoco/features/issue/resources/en/AGENTS.md +109 -0
  93. monoco/features/issue/resources/en/{SKILL.md → skills/monoco_atom_issue/SKILL.md} +3 -1
  94. monoco/features/issue/resources/en/skills/monoco_workflow_issue_creation/SKILL.md +167 -0
  95. monoco/features/issue/resources/en/skills/monoco_workflow_issue_development/SKILL.md +224 -0
  96. monoco/features/issue/resources/en/skills/monoco_workflow_issue_management/SKILL.md +159 -0
  97. monoco/features/issue/resources/en/skills/monoco_workflow_issue_refinement/SKILL.md +203 -0
  98. monoco/features/issue/resources/hooks/post-checkout.sh +39 -0
  99. monoco/features/issue/resources/hooks/pre-commit.sh +41 -0
  100. monoco/features/issue/resources/hooks/pre-push.sh +35 -0
  101. monoco/features/issue/resources/zh/AGENTS.md +109 -0
  102. monoco/features/issue/resources/zh/{SKILL.md → skills/monoco_atom_issue_lifecycle/SKILL.md} +3 -1
  103. monoco/features/issue/resources/zh/skills/monoco_workflow_issue_creation/SKILL.md +167 -0
  104. monoco/features/issue/resources/zh/skills/monoco_workflow_issue_development/SKILL.md +224 -0
  105. monoco/features/issue/resources/{skills/issue_lifecycle_workflow → zh/skills/monoco_workflow_issue_management}/SKILL.md +2 -2
  106. monoco/features/issue/resources/zh/skills/monoco_workflow_issue_refinement/SKILL.md +203 -0
  107. monoco/features/issue/validator.py +101 -1
  108. monoco/features/memo/adapter.py +21 -8
  109. monoco/features/memo/cli.py +103 -10
  110. monoco/features/memo/core.py +178 -92
  111. monoco/features/memo/models.py +53 -0
  112. monoco/features/memo/resources/en/skills/monoco_atom_memo/SKILL.md +77 -0
  113. monoco/features/memo/resources/en/skills/monoco_workflow_note_processing/SKILL.md +140 -0
  114. monoco/features/memo/resources/zh/{SKILL.md → skills/monoco_atom_memo/SKILL.md} +3 -1
  115. monoco/features/memo/resources/{skills/note_processing_workflow → zh/skills/monoco_workflow_note_processing}/SKILL.md +2 -2
  116. monoco/features/spike/adapter.py +18 -5
  117. monoco/features/spike/resources/en/{SKILL.md → skills/monoco_atom_spike/SKILL.md} +3 -1
  118. monoco/features/spike/resources/en/skills/monoco_workflow_research/SKILL.md +121 -0
  119. monoco/features/spike/resources/zh/{SKILL.md → skills/monoco_atom_spike/SKILL.md} +3 -1
  120. monoco/features/spike/resources/{skills/research_workflow → zh/skills/monoco_workflow_research}/SKILL.md +2 -2
  121. monoco/main.py +38 -1
  122. monoco_toolkit-0.3.11.dist-info/METADATA +130 -0
  123. monoco_toolkit-0.3.11.dist-info/RECORD +181 -0
  124. monoco/features/agent/reliability.py +0 -106
  125. monoco/features/agent/resources/skills/flow_reviewer/SKILL.md +0 -114
  126. monoco_toolkit-0.3.9.dist-info/METADATA +0 -127
  127. monoco_toolkit-0.3.9.dist-info/RECORD +0 -115
  128. /monoco/{core → features/agent}/resources/en/AGENTS.md +0 -0
  129. /monoco/{core → features/agent}/resources/zh/AGENTS.md +0 -0
  130. {monoco_toolkit-0.3.9.dist-info → monoco_toolkit-0.3.11.dist-info}/WHEEL +0 -0
  131. {monoco_toolkit-0.3.9.dist-info → monoco_toolkit-0.3.11.dist-info}/entry_points.txt +0 -0
  132. {monoco_toolkit-0.3.9.dist-info → monoco_toolkit-0.3.11.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,161 @@
1
+ """
2
+ Artifact data models for Monoco Artifact System.
3
+
4
+ Defines the metadata structure, enums, and data classes for artifact management.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import hashlib
10
+ import json
11
+ from datetime import datetime, timezone
12
+ from enum import Enum
13
+ from pathlib import Path
14
+ from typing import Any, Optional
15
+
16
+ from pydantic import BaseModel, Field, field_validator
17
+
18
+
19
class ArtifactSourceType(str, Enum):
    """Origin of an artifact: how its content came into existence.

    Members are ``str`` subclasses, so they compare equal to their plain
    string values and serialize as those values.
    """

    # Content produced by an AI agent.
    GENERATED = "generated"
    # File supplied directly by a user.
    UPLOADED = "uploaded"
    # Content brought in from an external source.
    IMPORTED = "imported"
    # Content computed from another artifact.
    DERIVED = "derived"
26
+
27
+
28
class ArtifactStatus(str, Enum):
    """Lifecycle state an artifact can be in.

    Members are ``str`` subclasses, so they compare equal to their plain
    string values and serialize as those values.
    """

    ACTIVE = "active"
    ARCHIVED = "archived"
    EXPIRED = "expired"
    DELETED = "deleted"
35
+
36
+
37
class ArtifactMetadata(BaseModel):
    """
    Metadata record for an artifact in the manifest.

    Each artifact is uniquely identified by its content hash (SHA256).
    The manifest.jsonl contains one JSON line per artifact metadata.
    """

    artifact_id: str = Field(
        description="Unique identifier (ULID or UUID) for the artifact instance"
    )
    content_hash: str = Field(
        description="SHA256 hash of the artifact content (CAS address)"
    )
    source_type: ArtifactSourceType = Field(description="How the artifact was created")
    status: ArtifactStatus = Field(
        default=ArtifactStatus.ACTIVE, description="Current lifecycle status"
    )
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc), description="Creation timestamp (UTC)"
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc), description="Last update timestamp (UTC)"
    )
    expires_at: Optional[datetime] = Field(
        default=None, description="Optional expiration timestamp"
    )
    content_type: str = Field(
        default="application/octet-stream", description="MIME type of the content"
    )
    size_bytes: int = Field(default=0, description="Size of the artifact in bytes")
    original_filename: Optional[str] = Field(
        default=None, description="Original filename if uploaded"
    )
    source_url: Optional[str] = Field(
        default=None, description="Source URL if imported from external"
    )
    parent_artifact_id: Optional[str] = Field(
        default=None, description="Parent artifact ID if this is derived"
    )
    tags: list[str] = Field(default_factory=list, description="User-defined tags")
    metadata: dict[str, Any] = Field(
        default_factory=dict, description="Additional metadata key-value pairs"
    )

    @field_validator("content_hash")
    @classmethod
    def validate_content_hash(cls, v: str) -> str:
        """Validate that content_hash is a 64-character SHA256 hex string.

        Raises:
            ValueError: If the value is not exactly 64 characters long or
                contains anything other than ASCII hex digits.
        """
        if len(v) != 64:
            raise ValueError("content_hash must be a 64-character SHA256 hex string")
        # Check character by character rather than with int(v, 16): int() also
        # accepts a sign, a "0x" prefix, underscores, surrounding whitespace and
        # non-ASCII digits, none of which belong in a hash used as a CAS path.
        if any(ch not in "0123456789abcdefABCDEF" for ch in v):
            raise ValueError("content_hash must be a valid hex string")
        return v

    def to_jsonl_line(self) -> str:
        """Serialize to a single newline-terminated JSON line for manifest.jsonl."""
        return json.dumps(self.model_dump(mode="json"), ensure_ascii=False) + "\n"

    @classmethod
    def from_jsonl_line(cls, line: str) -> ArtifactMetadata:
        """Deserialize and validate a record from one JSON line."""
        data = json.loads(line.strip())
        return cls.model_validate(data)

    @property
    def is_expired(self) -> bool:
        """Check if the artifact has expired.

        Returns False when no expiration is set. A timezone-naive
        ``expires_at`` is interpreted as UTC so that the comparison with the
        aware current time cannot raise ``TypeError``.
        """
        if self.expires_at is None:
            return False
        deadline = self.expires_at
        if deadline.utcoffset() is None:
            # Naive timestamp (e.g. an ISO string without offset in the manifest).
            deadline = deadline.replace(tzinfo=timezone.utc)
        return datetime.now(timezone.utc) > deadline

    @property
    def cas_path_components(self) -> tuple[str, str, str]:
        """
        Generate CAS storage path components from content_hash.

        Returns (prefix1, prefix2, filename) for tiered directory structure.
        Example: hash='abc123...' -> ('ab', 'c1', 'abc123...')

        Raises:
            ValueError: If content_hash has fewer than 4 characters.
        """
        if len(self.content_hash) < 4:
            raise ValueError("content_hash too short for path generation")
        return (
            self.content_hash[:2],
            self.content_hash[2:4],
            self.content_hash,
        )

    @property
    def cas_relative_path(self) -> str:
        """Get the relative CAS path for this artifact as 'p1/p2/hash'."""
        p1, p2, filename = self.cas_path_components
        return f"{p1}/{p2}/{filename}"
132
+
133
+
134
def compute_content_hash(content: bytes) -> str:
    """
    Return the SHA256 digest of in-memory content, used as its CAS address.

    Args:
        content: Raw bytes of the artifact content

    Returns:
        64-character lowercase hex string of the SHA256 hash
    """
    digest = hashlib.sha256()
    digest.update(content)
    return digest.hexdigest()
145
+
146
+
147
def compute_file_hash(file_path: Path) -> str:
    """
    Return the SHA256 digest of a file on disk, used as its CAS address.

    The file is read in 8192-byte blocks so it is never loaded into memory
    as a whole.

    Args:
        file_path: Path to the file to hash

    Returns:
        64-character lowercase hex string of the SHA256 hash
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
monoco/core/config.py CHANGED
@@ -72,6 +72,24 @@ class TelemetryConfig(BaseModel):
72
72
  )
73
73
 
74
74
 
75
class HooksConfig(BaseModel):
    """Configuration for git hooks management.

    Holds one global switch plus two independent toggle maps: one keyed by
    feature name and one keyed by git hook type.
    """

    # Master switch for the hooks system; on by default.
    enabled: bool = Field(default=True, description="Whether hooks system is enabled")
    # Per-feature toggles; empty by default.
    features: Dict[str, bool] = Field(
        default_factory=dict,
        description="Per-feature hook enable/disable (feature_name -> enabled)"
    )
    # Per-hook-type toggles. By default only "pre-commit" is enabled;
    # "pre-push" and "post-checkout" are present but disabled.
    hooks: Dict[str, bool] = Field(
        default_factory=lambda: {
            "pre-commit": True,
            "pre-push": False,
            "post-checkout": False,
        },
        description="Per-hook-type enable/disable (hook_type -> enabled)"
    )
91
+
92
+
75
93
  class IssueTypeConfig(BaseModel):
76
94
  name: str
77
95
  label: str
@@ -91,6 +109,7 @@ class TransitionConfig(BaseModel):
91
109
  required_solution: Optional[str] = None
92
110
  description: str = ""
93
111
  command_template: Optional[str] = None
112
+ post_actions: List[str] = Field(default_factory=list)
94
113
 
95
114
 
96
115
  class CriticalityRuleConfig(BaseModel):
@@ -135,6 +154,23 @@ class CriticalityConfig(BaseModel):
135
154
  return self
136
155
 
137
156
 
157
class AgentConcurrencyConfig(BaseModel):
    """Configuration for agent concurrency limits (semaphore-based).

    Defines a global cap (default 3), one cap per agent role (default 1 each)
    and a cooldown applied after a failure (default 60 seconds).
    """
    global_max: int = Field(default=3, description="Global maximum concurrent agents across all roles")
    # Per-role limits.
    engineer: int = Field(default=1, description="Maximum concurrent Engineer agents")
    architect: int = Field(default=1, description="Maximum concurrent Architect agents")
    reviewer: int = Field(default=1, description="Maximum concurrent Reviewer agents")
    planner: int = Field(default=1, description="Maximum concurrent Planner agents")
    # Cool-down configuration
    failure_cooldown_seconds: int = Field(default=60, description="Cooldown period after a failure before retrying")
166
+
167
+
168
class AgentConfig(BaseModel):
    """Configuration for AI Agents.

    Combines the global session timeout (default 900 seconds) with the
    nested concurrency limits.
    """
    timeout_seconds: int = Field(default=900, description="Global timeout for agent sessions")
    # Built from AgentConcurrencyConfig defaults when not supplied.
    concurrency: AgentConcurrencyConfig = Field(default_factory=AgentConcurrencyConfig)
172
+
173
+
138
174
  class IssueSchemaConfig(BaseModel):
139
175
  types: List[IssueTypeConfig] = Field(default_factory=list)
140
176
  statuses: List[str] = Field(default_factory=list)
@@ -230,10 +266,7 @@ class MonocoConfig(BaseModel):
230
266
  i18n: I18nConfig = Field(default_factory=I18nConfig)
231
267
  ui: UIConfig = Field(default_factory=UIConfig)
232
268
  telemetry: TelemetryConfig = Field(default_factory=TelemetryConfig)
233
- hooks: Dict[str, str] = Field(
234
- default_factory=dict,
235
- description="Git hooks configuration (hook_name -> command)",
236
- )
269
+ hooks: HooksConfig = Field(default_factory=HooksConfig)
237
270
  session_hooks: Dict[str, Any] = Field(
238
271
  default_factory=dict,
239
272
  description="Session lifecycle hooks configuration (hook_name -> config)",
@@ -241,6 +274,7 @@ class MonocoConfig(BaseModel):
241
274
 
242
275
  issue: IssueSchemaConfig = Field(default_factory=IssueSchemaConfig)
243
276
  domains: DomainConfig = Field(default_factory=DomainConfig)
277
+ agent: AgentConfig = Field(default_factory=AgentConfig)
244
278
 
245
279
  @staticmethod
246
280
  def _deep_merge(base: Dict[str, Any], update: Dict[str, Any]) -> Dict[str, Any]:
monoco/core/git.py CHANGED
@@ -149,6 +149,29 @@ def delete_branch(path: Path, branch_name: str, force: bool = False):
149
149
  raise RuntimeError(f"Failed to delete branch {branch_name}: {stderr}")
150
150
 
151
151
 
152
def get_merge_base(path: Path, ref1: str, ref2: str) -> str:
    """Return the output of ``git merge-base ref1 ref2``, stripped of whitespace.

    Args:
        path: Passed to ``_run_git`` as its second argument.
        ref1: First ref.
        ref2: Second ref.

    Raises:
        RuntimeError: If git exits with a non-zero status.
    """
    status, out, err = _run_git(["merge-base", ref1, ref2], path)
    if status == 0:
        return out.strip()
    raise RuntimeError(f"Failed to find merge base: {err}")
157
+
158
+
159
def git_checkout_files(path: Path, ref: str, files: List[str]):
    """Run ``git checkout <ref> -- <files...>`` for the given files.

    Does nothing when ``files`` is empty.

    Args:
        path: Passed to ``_run_git`` as its second argument.
        ref: Ref to take the file contents from.
        files: Paths to check out.

    Raises:
        RuntimeError: If git exits with a non-zero status.
    """
    if files:
        git_args = ["checkout", ref, "--"] + files
        status, _, err = _run_git(git_args, path)
        if status != 0:
            raise RuntimeError(f"Failed to checkout files from {ref}: {err}")
165
+
166
+
167
def has_diff(path: Path, ref1: str, ref2: str, files: List[str]) -> bool:
    """Check if there are differences between two refs for specific files.

    Returns False when ``files`` is empty, and also when the git command
    exits with a non-zero status, so a failed command is indistinguishable
    from "no differences".
    """
    if not files:
        return False
    git_args = ["diff", "--name-only", ref1, ref2, "--"] + files
    status, out, _ = _run_git(git_args, path)
    if status != 0:
        return False
    return out.strip() != ""
173
+
174
+
152
175
  def get_worktrees(path: Path) -> List[Tuple[str, str, str]]:
153
176
  """Returns list of (path, head, branch)"""
154
177
  code, stdout, stderr = _run_git(["worktree", "list", "--porcelain"], path)
@@ -31,7 +31,7 @@ class GitCleanupHook(SessionLifecycleHook):
31
31
 
32
32
  # Configuration with defaults
33
33
  self.auto_switch_to_main = self.config.get("auto_switch_to_main", True)
34
- self.auto_delete_merged_branches = self.config.get("auto_delete_merged_branches", True)
34
+ self.auto_delete_merged_branches = self.config.get("auto_delete_merged_branches", False)
35
35
  self.main_branch = self.config.get("main_branch", "main")
36
36
  self.require_clean_worktree = self.config.get("require_clean_worktree", True)
37
37
 
@@ -0,0 +1,20 @@
1
+ """
2
+ Monoco Mailroom - Automated Ingestion System
3
+
4
+ Provides environment discovery, file watching, and automated conversion
5
+ for document ingestion into the Monoco Artifact System.
6
+ """
7
+
8
+ from .discovery import EnvironmentDiscovery, ConversionTool
9
+ from .worker import ConversionWorker, ConversionTask, ConversionResult
10
+ from .watcher import DropzoneWatcher, IngestionEvent
11
+
12
+ __all__ = [
13
+ "EnvironmentDiscovery",
14
+ "ConversionTool",
15
+ "ConversionWorker",
16
+ "ConversionTask",
17
+ "ConversionResult",
18
+ "DropzoneWatcher",
19
+ "IngestionEvent",
20
+ ]
@@ -0,0 +1,248 @@
1
+ """
2
+ Environment Discovery Module for Monoco Mailroom.
3
+
4
+ Automatically detects available document conversion tools in the system,
5
+ including LibreOffice (soffice), Pandoc, and PDF processing engines.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import shutil
11
+ import subprocess
12
+ from dataclasses import dataclass, field
13
+ from enum import Enum
14
+ from pathlib import Path
15
+ from typing import Optional
16
+
17
+
18
class ToolType(str, Enum):
    """Kinds of conversion tool the discovery module distinguishes.

    Members are ``str`` subclasses, so they compare equal to their plain
    string values.
    """

    LIBREOFFICE = "libreoffice"
    PANDOC = "pandoc"
    PDF2TEXT = "pdf2text"
    PDFTOHTML = "pdftohtml"
    CUSTOM = "custom"
25
+
26
+
27
class ToolCapability(str, Enum):
    """Conversions a tool can perform, named ``<source>_TO_<target>``.

    Members are ``str`` subclasses, so they compare equal to their plain
    string values.
    """

    DOCX_TO_TEXT = "docx_to_text"
    DOCX_TO_MD = "docx_to_md"
    PDF_TO_TEXT = "pdf_to_text"
    PDF_TO_HTML = "pdf_to_html"
    ODT_TO_TEXT = "odt_to_text"
    XLSX_TO_CSV = "xlsx_to_csv"
    PPTX_TO_TEXT = "pptx_to_text"
36
+
37
+
38
@dataclass
class ConversionTool:
    """Represents a discovered conversion tool."""

    name: str
    tool_type: ToolType
    executable_path: Path
    version: str = "unknown"
    capabilities: list[ToolCapability] = field(default_factory=list)
    priority: int = 0  # Higher = preferred

    def is_available(self) -> bool:
        """Check if the tool executable is a regular file and is runnable.

        Returns:
            True only when ``executable_path`` points at an existing file
            (symlinks are followed) that the current process may execute.
        """
        # Imported locally so this method does not depend on where (or
        # whether) the module imports ``os`` at file level.
        import os

        # is_file() rather than exists(): a directory normally carries the
        # execute bit too and would otherwise be reported as a runnable tool.
        return self.executable_path.is_file() and os.access(
            self.executable_path, os.X_OK
        )
51
+
52
+
53
class EnvironmentDiscovery:
    """
    Discovers and manages document conversion tools in the system.

    Automatically detects:
    - LibreOffice (soffice) for Office document conversion
    - Pandoc for markdown/text conversion
    - PDF utilities (pdftotext, pdftohtml)

    Discovery results are cached on the instance after the first run.
    """

    # Known executable names to search for
    LIBREOFFICE_BINARIES = ["soffice", "libreoffice", "soffice.bin"]
    PANDOC_BINARIES = ["pandoc"]
    PDF_TOOLS = ["pdftotext", "pdftohtml", "pdf2txt.py"]

    def __init__(self):
        self._tools: dict[ToolType, list[ConversionTool]] = {}
        self._discovered = False

    def discover(self, force: bool = False) -> dict[ToolType, list[ConversionTool]]:
        """
        Discover all available conversion tools.

        Args:
            force: Force re-discovery even if already done

        Returns:
            Dictionary mapping tool types to lists of discovered tools
        """
        if self._discovered and not force:
            return self._tools

        # NOTE: every PDF utility, including pdftohtml (whose own tool_type
        # is PDFTOHTML), is stored under the PDF2TEXT key. Lookups by
        # capability iterate over all values, so they are unaffected.
        self._tools = {
            ToolType.LIBREOFFICE: self._discover_libreoffice(),
            ToolType.PANDOC: self._discover_pandoc(),
            ToolType.PDF2TEXT: self._discover_pdf_tools(),
        }

        self._discovered = True
        return self._tools

    def _find_executable(self, names: list[str]) -> Optional[Path]:
        """Find the first name on PATH and return its resolved path, else None."""
        for name in names:
            path = shutil.which(name)
            if path:
                return Path(path).resolve()
        return None

    def _get_version(self, executable: Path, version_arg: str = "--version") -> str:
        """Get version string from an executable.

        Runs ``executable version_arg`` with a 5 second timeout and returns
        the first non-blank line of stdout, falling back to stderr.

        Returns:
            The stripped version line, or "unknown" if the command cannot be
            run, times out, or prints nothing but whitespace.
        """
        try:
            result = subprocess.run(
                [str(executable), version_arg],
                capture_output=True,
                text=True,
                timeout=5,
                check=False,
            )
        except (subprocess.TimeoutExpired, OSError, ValueError):
            return "unknown"

        # Prefer stdout; some tools print their version banner to stderr.
        for stream in (result.stdout, result.stderr):
            for line in (stream or "").splitlines():
                stripped = line.strip()
                if stripped:
                    return stripped
        # Whitespace-only output is not a version string.
        return "unknown"

    def _discover_libreoffice(self) -> list[ConversionTool]:
        """Discover LibreOffice installation."""
        tools = []
        executable = self._find_executable(self.LIBREOFFICE_BINARIES)

        if executable:
            version = self._get_version(executable)
            tools.append(ConversionTool(
                name="LibreOffice",
                tool_type=ToolType.LIBREOFFICE,
                executable_path=executable,
                version=version,
                capabilities=[
                    ToolCapability.DOCX_TO_TEXT,
                    ToolCapability.DOCX_TO_MD,
                    ToolCapability.ODT_TO_TEXT,
                    ToolCapability.XLSX_TO_CSV,
                    ToolCapability.PPTX_TO_TEXT,
                ],
                priority=100,  # High priority for Office docs
            ))

        return tools

    def _discover_pandoc(self) -> list[ConversionTool]:
        """Discover Pandoc installation."""
        tools = []
        executable = self._find_executable(self.PANDOC_BINARIES)

        if executable:
            version = self._get_version(executable)
            tools.append(ConversionTool(
                name="Pandoc",
                tool_type=ToolType.PANDOC,
                executable_path=executable,
                version=version,
                capabilities=[
                    ToolCapability.DOCX_TO_MD,
                    ToolCapability.DOCX_TO_TEXT,
                    ToolCapability.ODT_TO_TEXT,
                ],
                priority=90,
            ))

        return tools

    def _discover_pdf_tools(self) -> list[ConversionTool]:
        """Discover PDF conversion tools."""
        tools = []

        # pdftotext (from poppler-utils)
        pdftotext = self._find_executable(["pdftotext"])
        if pdftotext:
            version = self._get_version(pdftotext, "-v")
            tools.append(ConversionTool(
                name="pdftotext",
                tool_type=ToolType.PDF2TEXT,
                executable_path=pdftotext,
                version=version,
                capabilities=[ToolCapability.PDF_TO_TEXT],
                priority=100,
            ))

        # pdftohtml
        pdftohtml = self._find_executable(["pdftohtml"])
        if pdftohtml:
            version = self._get_version(pdftohtml, "-v")
            tools.append(ConversionTool(
                name="pdftohtml",
                tool_type=ToolType.PDFTOHTML,
                executable_path=pdftohtml,
                version=version,
                capabilities=[ToolCapability.PDF_TO_HTML],
                priority=80,
            ))

        return tools

    def get_best_tool(self, capability: ToolCapability) -> Optional[ConversionTool]:
        """
        Get the best available tool for a specific capability.

        Runs discovery first if it has not happened yet.

        Args:
            capability: The required conversion capability

        Returns:
            The matching ConversionTool with the highest priority (the first
            one found wins a tie), or None if no tool matches
        """
        candidates = [
            tool for tool in self.get_all_tools()
            if capability in tool.capabilities
        ]
        # max() returns the first maximal element, matching the result of a
        # stable descending sort without ordering the whole list.
        return max(candidates, key=lambda t: t.priority, default=None)

    def get_all_tools(self) -> list[ConversionTool]:
        """Get all discovered tools as one flat list, discovering if needed."""
        if not self._discovered:
            self.discover()

        return [tool for tool_list in self._tools.values() for tool in tool_list]

    def has_capability(self, capability: ToolCapability) -> bool:
        """Check if any tool supports the given capability."""
        return self.get_best_tool(capability) is not None

    def get_capabilities_summary(self) -> dict[str, bool]:
        """Get a summary mapping each capability value to its availability."""
        return {
            cap.value: self.has_capability(cap)
            for cap in ToolCapability
        }
245
+
246
+
247
+ # Import os here to avoid issues with dataclass
248
+ import os