monoco-toolkit 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monoco/__main__.py +8 -0
- monoco/core/artifacts/__init__.py +16 -0
- monoco/core/artifacts/manager.py +575 -0
- monoco/core/artifacts/models.py +161 -0
- monoco/core/automation/__init__.py +51 -0
- monoco/core/automation/config.py +338 -0
- monoco/core/automation/field_watcher.py +296 -0
- monoco/core/automation/handlers.py +723 -0
- monoco/core/config.py +31 -4
- monoco/core/executor/__init__.py +38 -0
- monoco/core/executor/agent_action.py +254 -0
- monoco/core/executor/git_action.py +303 -0
- monoco/core/executor/im_action.py +309 -0
- monoco/core/executor/pytest_action.py +218 -0
- monoco/core/git.py +38 -0
- monoco/core/hooks/context.py +74 -13
- monoco/core/ingestion/__init__.py +20 -0
- monoco/core/ingestion/discovery.py +248 -0
- monoco/core/ingestion/watcher.py +343 -0
- monoco/core/ingestion/worker.py +436 -0
- monoco/core/loader.py +633 -0
- monoco/core/registry.py +34 -25
- monoco/core/router/__init__.py +55 -0
- monoco/core/router/action.py +341 -0
- monoco/core/router/router.py +392 -0
- monoco/core/scheduler/__init__.py +63 -0
- monoco/core/scheduler/base.py +152 -0
- monoco/core/scheduler/engines.py +175 -0
- monoco/core/scheduler/events.py +171 -0
- monoco/core/scheduler/local.py +377 -0
- monoco/core/skills.py +119 -80
- monoco/core/watcher/__init__.py +57 -0
- monoco/core/watcher/base.py +365 -0
- monoco/core/watcher/dropzone.py +152 -0
- monoco/core/watcher/issue.py +303 -0
- monoco/core/watcher/memo.py +200 -0
- monoco/core/watcher/task.py +238 -0
- monoco/daemon/app.py +77 -1
- monoco/daemon/commands.py +10 -0
- monoco/daemon/events.py +34 -0
- monoco/daemon/mailroom_service.py +196 -0
- monoco/daemon/models.py +1 -0
- monoco/daemon/scheduler.py +207 -0
- monoco/daemon/services.py +27 -58
- monoco/daemon/triggers.py +55 -0
- monoco/features/agent/__init__.py +25 -7
- monoco/features/agent/adapter.py +17 -7
- monoco/features/agent/cli.py +91 -57
- monoco/features/agent/engines.py +31 -170
- monoco/{core/resources/en/skills/monoco_core → features/agent/resources/en/skills/monoco_atom_core}/SKILL.md +2 -2
- monoco/features/agent/resources/en/skills/{flow_engineer → monoco_workflow_agent_engineer}/SKILL.md +2 -2
- monoco/features/agent/resources/en/skills/{flow_manager → monoco_workflow_agent_manager}/SKILL.md +2 -2
- monoco/features/agent/resources/en/skills/{flow_planner → monoco_workflow_agent_planner}/SKILL.md +2 -2
- monoco/features/agent/resources/en/skills/{flow_reviewer → monoco_workflow_agent_reviewer}/SKILL.md +2 -2
- monoco/features/agent/resources/{roles/role-engineer.yaml → zh/roles/monoco_role_engineer.yaml} +3 -3
- monoco/features/agent/resources/{roles/role-manager.yaml → zh/roles/monoco_role_manager.yaml} +8 -8
- monoco/features/agent/resources/{roles/role-planner.yaml → zh/roles/monoco_role_planner.yaml} +8 -8
- monoco/features/agent/resources/{roles/role-reviewer.yaml → zh/roles/monoco_role_reviewer.yaml} +8 -8
- monoco/{core/resources/zh/skills/monoco_core → features/agent/resources/zh/skills/monoco_atom_core}/SKILL.md +2 -2
- monoco/features/agent/resources/zh/skills/{flow_engineer → monoco_workflow_agent_engineer}/SKILL.md +2 -2
- monoco/features/agent/resources/zh/skills/{flow_manager → monoco_workflow_agent_manager}/SKILL.md +2 -2
- monoco/features/agent/resources/zh/skills/{flow_planner → monoco_workflow_agent_planner}/SKILL.md +2 -2
- monoco/features/agent/resources/zh/skills/{flow_reviewer → monoco_workflow_agent_reviewer}/SKILL.md +2 -2
- monoco/features/agent/worker.py +1 -1
- monoco/features/artifact/__init__.py +0 -0
- monoco/features/artifact/adapter.py +33 -0
- monoco/features/artifact/resources/zh/AGENTS.md +14 -0
- monoco/features/artifact/resources/zh/skills/monoco_atom_artifact/SKILL.md +278 -0
- monoco/features/glossary/adapter.py +18 -7
- monoco/features/glossary/resources/en/skills/{monoco_glossary → monoco_atom_glossary}/SKILL.md +2 -2
- monoco/features/glossary/resources/zh/skills/{monoco_glossary → monoco_atom_glossary}/SKILL.md +2 -2
- monoco/features/hooks/__init__.py +11 -0
- monoco/features/hooks/adapter.py +67 -0
- monoco/features/hooks/commands.py +309 -0
- monoco/features/hooks/core.py +441 -0
- monoco/features/hooks/resources/ADDING_HOOKS.md +234 -0
- monoco/features/i18n/adapter.py +18 -5
- monoco/features/i18n/core.py +482 -17
- monoco/features/i18n/resources/en/skills/{monoco_i18n → monoco_atom_i18n}/SKILL.md +2 -2
- monoco/features/i18n/resources/en/skills/{i18n_scan_workflow → monoco_workflow_i18n_scan}/SKILL.md +2 -2
- monoco/features/i18n/resources/zh/skills/{monoco_i18n → monoco_atom_i18n}/SKILL.md +2 -2
- monoco/features/i18n/resources/zh/skills/{i18n_scan_workflow → monoco_workflow_i18n_scan}/SKILL.md +2 -2
- monoco/features/issue/adapter.py +19 -6
- monoco/features/issue/commands.py +352 -20
- monoco/features/issue/core.py +475 -16
- monoco/features/issue/engine/machine.py +114 -4
- monoco/features/issue/linter.py +60 -5
- monoco/features/issue/models.py +2 -2
- monoco/features/issue/resources/en/AGENTS.md +109 -0
- monoco/features/issue/resources/en/skills/{monoco_issue → monoco_atom_issue}/SKILL.md +2 -2
- monoco/features/issue/resources/en/skills/{issue_create_workflow → monoco_workflow_issue_creation}/SKILL.md +2 -2
- monoco/features/issue/resources/en/skills/{issue_develop_workflow → monoco_workflow_issue_development}/SKILL.md +2 -2
- monoco/features/issue/resources/en/skills/{issue_lifecycle_workflow → monoco_workflow_issue_management}/SKILL.md +2 -2
- monoco/features/issue/resources/en/skills/{issue_refine_workflow → monoco_workflow_issue_refinement}/SKILL.md +2 -2
- monoco/features/issue/resources/hooks/post-checkout.sh +39 -0
- monoco/features/issue/resources/hooks/pre-commit.sh +41 -0
- monoco/features/issue/resources/hooks/pre-push.sh +35 -0
- monoco/features/issue/resources/zh/AGENTS.md +109 -0
- monoco/features/issue/resources/zh/skills/{monoco_issue → monoco_atom_issue_lifecycle}/SKILL.md +2 -2
- monoco/features/issue/resources/zh/skills/{issue_create_workflow → monoco_workflow_issue_creation}/SKILL.md +2 -2
- monoco/features/issue/resources/zh/skills/{issue_develop_workflow → monoco_workflow_issue_development}/SKILL.md +2 -2
- monoco/features/issue/resources/zh/skills/{issue_lifecycle_workflow → monoco_workflow_issue_management}/SKILL.md +2 -2
- monoco/features/issue/resources/zh/skills/{issue_refine_workflow → monoco_workflow_issue_refinement}/SKILL.md +2 -2
- monoco/features/issue/validator.py +101 -1
- monoco/features/memo/adapter.py +21 -8
- monoco/features/memo/cli.py +103 -10
- monoco/features/memo/core.py +178 -92
- monoco/features/memo/models.py +53 -0
- monoco/features/memo/resources/en/skills/{monoco_memo → monoco_atom_memo}/SKILL.md +2 -2
- monoco/features/memo/resources/en/skills/{note_processing_workflow → monoco_workflow_note_processing}/SKILL.md +2 -2
- monoco/features/memo/resources/zh/skills/{monoco_memo → monoco_atom_memo}/SKILL.md +2 -2
- monoco/features/memo/resources/zh/skills/{note_processing_workflow → monoco_workflow_note_processing}/SKILL.md +2 -2
- monoco/features/spike/adapter.py +18 -5
- monoco/features/spike/commands.py +5 -3
- monoco/features/spike/resources/en/skills/{monoco_spike → monoco_atom_spike}/SKILL.md +2 -2
- monoco/features/spike/resources/en/skills/{research_workflow → monoco_workflow_research}/SKILL.md +2 -2
- monoco/features/spike/resources/zh/skills/{monoco_spike → monoco_atom_spike}/SKILL.md +2 -2
- monoco/features/spike/resources/zh/skills/{research_workflow → monoco_workflow_research}/SKILL.md +2 -2
- monoco/main.py +38 -1
- {monoco_toolkit-0.3.10.dist-info → monoco_toolkit-0.3.12.dist-info}/METADATA +7 -1
- monoco_toolkit-0.3.12.dist-info/RECORD +202 -0
- monoco/features/agent/apoptosis.py +0 -44
- monoco/features/agent/manager.py +0 -91
- monoco/features/agent/session.py +0 -121
- monoco_toolkit-0.3.10.dist-info/RECORD +0 -156
- /monoco/{core → features/agent}/resources/en/AGENTS.md +0 -0
- /monoco/{core → features/agent}/resources/zh/AGENTS.md +0 -0
- {monoco_toolkit-0.3.10.dist-info → monoco_toolkit-0.3.12.dist-info}/WHEEL +0 -0
- {monoco_toolkit-0.3.10.dist-info → monoco_toolkit-0.3.12.dist-info}/entry_points.txt +0 -0
- {monoco_toolkit-0.3.10.dist-info → monoco_toolkit-0.3.12.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""
Monoco Mailroom - Automated Ingestion System

Provides environment discovery, file watching, and automated conversion
for document ingestion into the Monoco Artifact System.
"""

# Re-export the package's public surface from its submodules so callers can
# import everything from ``monoco.core.ingestion`` directly.
from .discovery import EnvironmentDiscovery, ConversionTool
from .worker import ConversionWorker, ConversionTask, ConversionResult
from .watcher import DropzoneWatcher, IngestionEvent

# Explicit public API for ``from monoco.core.ingestion import *``.
__all__ = [
    "EnvironmentDiscovery",
    "ConversionTool",
    "ConversionWorker",
    "ConversionTask",
    "ConversionResult",
    "DropzoneWatcher",
    "IngestionEvent",
]
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Environment Discovery Module for Monoco Mailroom.
|
|
3
|
+
|
|
4
|
+
Automatically detects available document conversion tools in the system,
|
|
5
|
+
including LibreOffice (soffice), Pandoc, and PDF processing engines.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import shutil
|
|
11
|
+
import subprocess
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from enum import Enum
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ToolType(str, Enum):
    """Types of conversion tools supported.

    Member order is preserved deliberately: ``Enum`` iteration follows
    definition order.
    """
    LIBREOFFICE = "libreoffice"  # LibreOffice / soffice headless converter
    PANDOC = "pandoc"            # Pandoc document converter
    PDF2TEXT = "pdf2text"        # pdftotext-style PDF text extraction
    PDFTOHTML = "pdftohtml"      # pdftohtml PDF-to-HTML conversion
    CUSTOM = "custom"            # reserved; not produced by EnvironmentDiscovery in this module
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ToolCapability(str, Enum):
    """Capabilities of conversion tools.

    Each member names a source-format -> target-format conversion; the string
    value doubles as the key in ``get_capabilities_summary()``.
    """
    DOCX_TO_TEXT = "docx_to_text"  # Word document -> plain text
    DOCX_TO_MD = "docx_to_md"      # Word document -> Markdown
    PDF_TO_TEXT = "pdf_to_text"    # PDF -> plain text
    PDF_TO_HTML = "pdf_to_html"    # PDF -> HTML
    ODT_TO_TEXT = "odt_to_text"    # OpenDocument text -> plain text
    XLSX_TO_CSV = "xlsx_to_csv"    # Excel workbook -> CSV
    PPTX_TO_TEXT = "pptx_to_text"  # PowerPoint deck -> plain text
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class ConversionTool:
    """Represents a discovered conversion tool.

    Instances are produced by :class:`EnvironmentDiscovery` and ranked by
    ``priority`` when several tools offer the same capability.
    """
    name: str                     # human-readable tool name, e.g. "Pandoc"
    tool_type: ToolType           # category bucket for this tool
    executable_path: Path         # resolved path to the tool's binary
    version: str = "unknown"      # first line of the tool's version banner
    capabilities: list[ToolCapability] = field(default_factory=list)
    priority: int = 0  # Higher = preferred

    def is_available(self) -> bool:
        """Check if the tool executable exists and is runnable.

        Returns:
            True when ``executable_path`` exists and the current user has
            execute permission on it.
        """
        # Import locally: the module-level ``import os`` historically lived at
        # the *bottom* of this file, so referencing a global ``os`` here is
        # fragile with respect to import order. A function-local import makes
        # this method self-sufficient regardless of module layout.
        import os

        return self.executable_path.exists() and os.access(self.executable_path, os.X_OK)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class EnvironmentDiscovery:
    """
    Discovers and manages document conversion tools in the system.

    Automatically detects:
    - LibreOffice (soffice) for Office document conversion
    - Pandoc for markdown/text conversion
    - PDF utilities (pdftotext, pdftohtml)
    """

    # Known executable names to search for
    LIBREOFFICE_BINARIES = ["soffice", "libreoffice", "soffice.bin"]
    PANDOC_BINARIES = ["pandoc"]
    PDF_TOOLS = ["pdftotext", "pdftohtml", "pdf2txt.py"]

    def __init__(self):
        # Tools found so far, bucketed by type; populated lazily by discover().
        self._tools: dict[ToolType, list[ConversionTool]] = {}
        self._discovered = False

    def discover(self, force: bool = False) -> dict[ToolType, list[ConversionTool]]:
        """
        Discover all available conversion tools.

        Args:
            force: Force re-discovery even if already done

        Returns:
            Dictionary mapping tool types to lists of discovered tools
        """
        if self._discovered and not force:
            return self._tools

        discovered: dict[ToolType, list[ConversionTool]] = {}
        discovered[ToolType.LIBREOFFICE] = self._discover_libreoffice()
        discovered[ToolType.PANDOC] = self._discover_pandoc()
        discovered[ToolType.PDF2TEXT] = self._discover_pdf_tools()

        self._tools = discovered
        self._discovered = True
        return self._tools

    def _find_executable(self, names: list[str]) -> Optional[Path]:
        """Return the resolved path of the first name found on PATH, or None."""
        for candidate in names:
            located = shutil.which(candidate)
            if located is not None:
                return Path(located).resolve()
        return None

    def _get_version(self, executable: Path, version_arg: str = "--version") -> str:
        """Return the first line of *executable*'s version output, or "unknown"."""
        try:
            proc = subprocess.run(
                [str(executable), version_arg],
                capture_output=True,
                text=True,
                timeout=5,
                check=False,
            )
        except (subprocess.TimeoutExpired, OSError, ValueError):
            return "unknown"
        # Some tools print their banner to stderr instead of stdout.
        banner = proc.stdout or proc.stderr
        if not banner:
            return "unknown"
        return banner.strip().split("\n")[0]

    def _discover_libreoffice(self) -> list[ConversionTool]:
        """Discover LibreOffice installation."""
        soffice = self._find_executable(self.LIBREOFFICE_BINARIES)
        if soffice is None:
            return []
        return [
            ConversionTool(
                name="LibreOffice",
                tool_type=ToolType.LIBREOFFICE,
                executable_path=soffice,
                version=self._get_version(soffice),
                capabilities=[
                    ToolCapability.DOCX_TO_TEXT,
                    ToolCapability.DOCX_TO_MD,
                    ToolCapability.ODT_TO_TEXT,
                    ToolCapability.XLSX_TO_CSV,
                    ToolCapability.PPTX_TO_TEXT,
                ],
                priority=100,  # High priority for Office docs
            )
        ]

    def _discover_pandoc(self) -> list[ConversionTool]:
        """Discover Pandoc installation."""
        pandoc = self._find_executable(self.PANDOC_BINARIES)
        if pandoc is None:
            return []
        return [
            ConversionTool(
                name="Pandoc",
                tool_type=ToolType.PANDOC,
                executable_path=pandoc,
                version=self._get_version(pandoc),
                capabilities=[
                    ToolCapability.DOCX_TO_MD,
                    ToolCapability.DOCX_TO_TEXT,
                    ToolCapability.ODT_TO_TEXT,
                ],
                priority=90,
            )
        ]

    def _discover_pdf_tools(self) -> list[ConversionTool]:
        """Discover PDF conversion tools (pdftotext / pdftohtml from poppler)."""
        found: list[ConversionTool] = []
        # (binary name, tool type, capability, priority) — checked in order.
        specs = [
            ("pdftotext", ToolType.PDF2TEXT, ToolCapability.PDF_TO_TEXT, 100),
            ("pdftohtml", ToolType.PDFTOHTML, ToolCapability.PDF_TO_HTML, 80),
        ]
        for binary, kind, capability, rank in specs:
            located = self._find_executable([binary])
            if located is None:
                continue
            found.append(ConversionTool(
                name=binary,
                tool_type=kind,
                executable_path=located,
                version=self._get_version(located, "-v"),
                capabilities=[capability],
                priority=rank,
            ))
        return found

    def get_best_tool(self, capability: ToolCapability) -> Optional[ConversionTool]:
        """
        Get the best available tool for a specific capability.

        Args:
            capability: The required conversion capability

        Returns:
            Best matching ConversionTool or None
        """
        if not self._discovered:
            self.discover()

        matching = [
            tool
            for bucket in self._tools.values()
            for tool in bucket
            if capability in tool.capabilities
        ]
        if not matching:
            return None

        # max() returns the first of several equally-ranked candidates, which
        # matches a stable descending sort followed by taking the head.
        return max(matching, key=lambda tool: tool.priority)

    def get_all_tools(self) -> list[ConversionTool]:
        """Get all discovered tools."""
        if not self._discovered:
            self.discover()
        return [tool for bucket in self._tools.values() for tool in bucket]

    def has_capability(self, capability: ToolCapability) -> bool:
        """Check if any tool supports the given capability."""
        return self.get_best_tool(capability) is not None

    def get_capabilities_summary(self) -> dict[str, bool]:
        """Get a summary of available capabilities."""
        summary: dict[str, bool] = {}
        for cap in ToolCapability:
            summary[cap.value] = self.has_capability(cap)
        return summary
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
# Import os here to avoid issues with dataclass
|
|
248
|
+
import os
|
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Dropzone Watcher for Monoco Mailroom.
|
|
3
|
+
|
|
4
|
+
Monitors dropzone directories for new files and triggers
|
|
5
|
+
automated ingestion workflows.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
import logging
|
|
12
|
+
import uuid
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from datetime import datetime, timezone
|
|
15
|
+
from enum import Enum
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Optional, Callable, Any, Set
|
|
18
|
+
|
|
19
|
+
from watchdog.observers import Observer
|
|
20
|
+
from watchdog.events import FileSystemEventHandler, FileCreatedEvent, FileMovedEvent
|
|
21
|
+
|
|
22
|
+
from .worker import ConversionWorker, ConversionTask, ConversionResult, ConversionStatus
|
|
23
|
+
from ..artifacts.manager import ArtifactManager
|
|
24
|
+
from ..artifacts.models import ArtifactSourceType
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class IngestionEventType(str, Enum):
    """Types of ingestion events.

    Members mirror the successive stages of the ingestion pipeline, from
    file detection through conversion to artifact registration.
    """
    FILE_DETECTED = "file_detected"              # new file seen in the dropzone
    CONVERSION_STARTED = "conversion_started"    # conversion task submitted
    CONVERSION_COMPLETED = "conversion_completed"  # conversion finished successfully
    CONVERSION_FAILED = "conversion_failed"      # conversion errored or was impossible
    ARTIFACT_REGISTERED = "artifact_registered"  # converted output stored as an artifact
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class IngestionEvent:
    """Event emitted during the ingestion process.

    Delivered to the callback registered via
    ``DropzoneWatcher.set_event_callback``. Only ``event_type`` and
    ``file_path`` are always populated; the remaining fields are filled in
    as the pipeline progresses.
    """
    event_type: IngestionEventType  # which pipeline stage fired
    file_path: Path  # the dropzone file this event refers to
    task_id: Optional[str] = None  # conversion task id, once one is assigned
    artifact_id: Optional[str] = None  # set after successful artifact registration
    error_message: Optional[str] = None  # populated only for failure events
    metadata: dict[str, Any] = field(default_factory=dict)  # stage-specific extras
    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))  # UTC event creation time
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class DropzoneHandler(FileSystemEventHandler):
    """Watchdog event handler that filters and forwards dropzone files.

    Translates raw filesystem events (creations and moves) into a single
    ``on_file_detected`` callback, skipping hidden/temporary/unsupported
    files and anything already seen.
    """

    def __init__(
        self,
        dropzone_path: Path,
        on_file_detected: Callable[[Path], None],
        supported_extensions: Optional[Set[str]] = None,
    ):
        self.dropzone_path = Path(dropzone_path)
        self.on_file_detected = on_file_detected
        if supported_extensions:
            self.supported_extensions = supported_extensions
        else:
            # Default whitelist: common Office and PDF document formats.
            self.supported_extensions = {
                ".docx", ".doc", ".pdf", ".odt",
                ".xlsx", ".xls", ".pptx", ".ppt",
            }
        # Resolved paths we have already forwarded, to avoid double-processing.
        self._processed_files: Set[Path] = set()

    def on_created(self, event):
        """Handle file creation events."""
        if not event.is_directory:
            self._maybe_dispatch(Path(event.src_path))

    def on_moved(self, event):
        """Handle file move events (e.g., atomic writes)."""
        if not event.is_directory:
            self._maybe_dispatch(Path(event.dest_path))

    def _maybe_dispatch(self, file_path: Path) -> None:
        """Mark *file_path* as seen and forward it if it passes the filter."""
        if self._should_process(file_path):
            self._processed_files.add(file_path.resolve())
            self.on_file_detected(file_path)

    def _should_process(self, file_path: Path) -> bool:
        """Check if a file should be processed."""
        hidden = file_path.name.startswith(".")
        temporary = file_path.suffix in (".tmp", ".temp", ".part")
        unsupported = file_path.suffix.lower() not in self.supported_extensions
        if hidden or temporary or unsupported:
            return False
        # Skip files that were already forwarded once.
        return file_path.resolve() not in self._processed_files
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class DropzoneWatcher:
    """
    Watches dropzone directories and orchestrates automated ingestion.

    Features:
    - Real-time file system monitoring
    - Automatic conversion using ConversionWorker
    - Artifact registration with ArtifactManager
    - Event callbacks for integration

    Pipeline (see ``_process_file``): detect -> convert -> register artifact,
    emitting an :class:`IngestionEvent` at every stage transition.
    """

    def __init__(
        self,
        dropzone_path: Path,
        artifact_manager: ArtifactManager,
        conversion_worker: Optional[ConversionWorker] = None,
        output_dir: Optional[Path] = None,
        process_existing: bool = False,
    ):
        """
        Initialize the dropzone watcher.

        Args:
            dropzone_path: Directory to monitor for new files
            artifact_manager: ArtifactManager for registering converted files
            conversion_worker: ConversionWorker for document conversion
            output_dir: Directory for converted files (default: dropzone/converted)
            process_existing: Whether to process files already in dropzone
        """
        self.dropzone_path = Path(dropzone_path)
        self.artifact_manager = artifact_manager
        self.conversion_worker = conversion_worker or ConversionWorker()
        self.output_dir = output_dir or (self.dropzone_path / "converted")
        self.process_existing = process_existing

        # Event callbacks
        self._on_event: Optional[Callable[[IngestionEvent], None]] = None

        # State
        self._observer: Optional[Observer] = None
        self._running = False
        # NOTE(review): nothing in this class ever *adds* to _pending_tasks
        # (see _on_file_detected), so stop()'s cancellation loop is currently
        # a no-op — confirm whether tasks were meant to be tracked here.
        self._pending_tasks: dict[str, asyncio.Task] = {}

    def set_event_callback(self, callback: Callable[[IngestionEvent], None]) -> None:
        """Set callback for ingestion events."""
        self._on_event = callback

    def _emit_event(self, event: IngestionEvent) -> None:
        """Emit an ingestion event.

        Callback exceptions are swallowed so a faulty subscriber cannot
        break the ingestion pipeline.
        """
        if self._on_event:
            try:
                self._on_event(event)
            except Exception:
                pass

    def start(self) -> None:
        """Start watching the dropzone directory.

        Creates the dropzone and output directories if needed, starts a
        watchdog observer thread, and optionally scans files already present.
        Idempotent: calling start() while running is a no-op.
        """
        if self._running:
            return

        # Ensure directories exist
        self.dropzone_path.mkdir(parents=True, exist_ok=True)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Set up file system observer
        self._handler = DropzoneHandler(
            self.dropzone_path,
            self._on_file_detected,
            set(self.conversion_worker.get_supported_extensions()),
        )

        self._observer = Observer()
        # Non-recursive: only files dropped directly into the dropzone root
        # are watched (the converted/ subdirectory is therefore excluded).
        self._observer.schedule(self._handler, str(self.dropzone_path), recursive=False)
        self._observer.start()

        self._running = True
        logger.info(f"Started watching dropzone: {self.dropzone_path}")

        # Process existing files if requested
        if self.process_existing:
            self._scan_existing_files()

    def stop(self) -> None:
        """Stop watching the dropzone directory.

        Cancels any tracked tasks, stops the observer thread, and blocks
        until it has joined. Idempotent when not running.
        """
        if not self._running:
            return

        self._running = False

        # Cancel pending tasks
        for task in self._pending_tasks.values():
            task.cancel()
        self._pending_tasks.clear()

        # Stop observer
        if self._observer:
            self._observer.stop()
            self._observer.join()
            self._observer = None

        logger.info(f"Stopped watching dropzone: {self.dropzone_path}")

    def _scan_existing_files(self) -> None:
        """Scan and process existing files in dropzone.

        Reuses the handler's filter so hidden/temporary/unsupported files
        are skipped exactly as they would be for live events.
        """
        for file_path in self.dropzone_path.iterdir():
            if file_path.is_file() and self._handler._should_process(file_path):
                self._on_file_detected(file_path)

    def _on_file_detected(self, file_path: Path) -> None:
        """Handle newly detected file.

        Emits FILE_DETECTED and schedules asynchronous processing.
        """
        logger.info(f"File detected: {file_path}")

        self._emit_event(IngestionEvent(
            event_type=IngestionEventType.FILE_DETECTED,
            file_path=file_path,
        ))

        # Create async task for processing
        task_id = str(uuid.uuid4())
        # NOTE(review): asyncio.create_task() requires a running event loop in
        # the *current* thread, but this callback is invoked from the watchdog
        # observer thread (and from start() via _scan_existing_files) — unless
        # a loop runs there this raises RuntimeError; consider capturing a loop
        # and using asyncio.run_coroutine_threadsafe. Also the returned task is
        # neither stored in _pending_tasks nor otherwise referenced, so it may
        # be garbage-collected before completing — confirm intended design.
        asyncio.create_task(self._process_file(file_path, task_id))

    async def _process_file(self, file_path: Path, task_id: str) -> None:
        """Process a detected file through the ingestion pipeline.

        Stages: capability check -> conversion -> artifact registration.
        Every stage transition (including failures) is reported through
        _emit_event; exceptions never propagate out of this coroutine.
        """
        try:
            # Step 1: Check if conversion is needed/possible
            if not self.conversion_worker.can_convert(file_path):
                logger.warning(f"Cannot convert file: {file_path}")
                self._emit_event(IngestionEvent(
                    event_type=IngestionEventType.CONVERSION_FAILED,
                    file_path=file_path,
                    task_id=task_id,
                    error_message="No conversion tool available for this file type",
                ))
                return

            # Step 2: Create conversion task
            # NOTE(review): target format is hard-coded to plain text here —
            # confirm whether richer targets (md/html) should be selectable.
            conversion_task = ConversionTask(
                task_id=task_id,
                source_path=file_path,
                target_format="txt",
                output_dir=self.output_dir,
            )

            self._emit_event(IngestionEvent(
                event_type=IngestionEventType.CONVERSION_STARTED,
                file_path=file_path,
                task_id=task_id,
            ))

            # Step 3: Perform conversion
            result = await self.conversion_worker.submit(conversion_task)

            if result.status != ConversionStatus.SUCCESS:
                logger.error(f"Conversion failed for {file_path}: {result.error_message}")
                self._emit_event(IngestionEvent(
                    event_type=IngestionEventType.CONVERSION_FAILED,
                    file_path=file_path,
                    task_id=task_id,
                    error_message=result.error_message,
                ))
                return

            self._emit_event(IngestionEvent(
                event_type=IngestionEventType.CONVERSION_COMPLETED,
                file_path=file_path,
                task_id=task_id,
                metadata={
                    "output_path": str(result.output_path),
                    "processing_time_ms": result.processing_time_ms,
                },
            ))

            # Step 4: Register as artifact
            if result.output_path and result.output_path.exists():
                artifact_meta = self._register_artifact(
                    result.output_path,
                    source_file=file_path,
                    conversion_metadata=result.metadata,
                )

                self._emit_event(IngestionEvent(
                    event_type=IngestionEventType.ARTIFACT_REGISTERED,
                    file_path=file_path,
                    task_id=task_id,
                    artifact_id=artifact_meta.artifact_id,
                    metadata={
                        "content_hash": artifact_meta.content_hash,
                        "content_type": artifact_meta.content_type,
                    },
                ))

                logger.info(f"Successfully ingested {file_path} as artifact {artifact_meta.artifact_id}")

        except Exception as e:
            # Catch-all boundary: report the failure as an event rather than
            # letting the exception escape the background task.
            logger.exception(f"Error processing file {file_path}")
            self._emit_event(IngestionEvent(
                event_type=IngestionEventType.CONVERSION_FAILED,
                file_path=file_path,
                task_id=task_id,
                error_message=str(e),
            ))

    def _register_artifact(
        self,
        file_path: Path,
        source_file: Path,
        conversion_metadata: dict[str, Any],
    ) -> Any:
        """Register converted file as an artifact.

        Args:
            file_path: Path of the converted output file to store.
            source_file: Original dropzone file, recorded in metadata.
            conversion_metadata: Extra metadata from the conversion result.

        Returns:
            Whatever ``artifact_manager.store_file`` returns; callers read
            ``artifact_id``, ``content_hash`` and ``content_type`` from it.
        """
        metadata = {
            "source_file": str(source_file),
            "original_filename": source_file.name,
            **conversion_metadata,
        }

        return self.artifact_manager.store_file(
            file_path=file_path,
            source_type=ArtifactSourceType.IMPORTED,
            content_type="text/plain",
            tags=["mailroom", "converted", source_file.suffix.lower().lstrip(".")],
            metadata=metadata,
        )

    def is_running(self) -> bool:
        """Check if the watcher is currently running."""
        return self._running

    def get_stats(self) -> dict[str, Any]:
        """Get watcher statistics."""
        return {
            "running": self._running,
            "dropzone_path": str(self.dropzone_path),
            "output_dir": str(self.output_dir),
            "pending_tasks": len(self._pending_tasks),
            "supported_extensions": list(self.conversion_worker.get_supported_extensions()),
        }
|