monoco-toolkit 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. monoco/__main__.py +8 -0
  2. monoco/core/artifacts/__init__.py +16 -0
  3. monoco/core/artifacts/manager.py +575 -0
  4. monoco/core/artifacts/models.py +161 -0
  5. monoco/core/automation/__init__.py +51 -0
  6. monoco/core/automation/config.py +338 -0
  7. monoco/core/automation/field_watcher.py +296 -0
  8. monoco/core/automation/handlers.py +723 -0
  9. monoco/core/config.py +31 -4
  10. monoco/core/executor/__init__.py +38 -0
  11. monoco/core/executor/agent_action.py +254 -0
  12. monoco/core/executor/git_action.py +303 -0
  13. monoco/core/executor/im_action.py +309 -0
  14. monoco/core/executor/pytest_action.py +218 -0
  15. monoco/core/git.py +38 -0
  16. monoco/core/hooks/context.py +74 -13
  17. monoco/core/ingestion/__init__.py +20 -0
  18. monoco/core/ingestion/discovery.py +248 -0
  19. monoco/core/ingestion/watcher.py +343 -0
  20. monoco/core/ingestion/worker.py +436 -0
  21. monoco/core/loader.py +633 -0
  22. monoco/core/registry.py +34 -25
  23. monoco/core/router/__init__.py +55 -0
  24. monoco/core/router/action.py +341 -0
  25. monoco/core/router/router.py +392 -0
  26. monoco/core/scheduler/__init__.py +63 -0
  27. monoco/core/scheduler/base.py +152 -0
  28. monoco/core/scheduler/engines.py +175 -0
  29. monoco/core/scheduler/events.py +171 -0
  30. monoco/core/scheduler/local.py +377 -0
  31. monoco/core/skills.py +119 -80
  32. monoco/core/watcher/__init__.py +57 -0
  33. monoco/core/watcher/base.py +365 -0
  34. monoco/core/watcher/dropzone.py +152 -0
  35. monoco/core/watcher/issue.py +303 -0
  36. monoco/core/watcher/memo.py +200 -0
  37. monoco/core/watcher/task.py +238 -0
  38. monoco/daemon/app.py +77 -1
  39. monoco/daemon/commands.py +10 -0
  40. monoco/daemon/events.py +34 -0
  41. monoco/daemon/mailroom_service.py +196 -0
  42. monoco/daemon/models.py +1 -0
  43. monoco/daemon/scheduler.py +207 -0
  44. monoco/daemon/services.py +27 -58
  45. monoco/daemon/triggers.py +55 -0
  46. monoco/features/agent/__init__.py +25 -7
  47. monoco/features/agent/adapter.py +17 -7
  48. monoco/features/agent/cli.py +91 -57
  49. monoco/features/agent/engines.py +31 -170
  50. monoco/{core/resources/en/skills/monoco_core → features/agent/resources/en/skills/monoco_atom_core}/SKILL.md +2 -2
  51. monoco/features/agent/resources/en/skills/{flow_engineer → monoco_workflow_agent_engineer}/SKILL.md +2 -2
  52. monoco/features/agent/resources/en/skills/{flow_manager → monoco_workflow_agent_manager}/SKILL.md +2 -2
  53. monoco/features/agent/resources/en/skills/{flow_planner → monoco_workflow_agent_planner}/SKILL.md +2 -2
  54. monoco/features/agent/resources/en/skills/{flow_reviewer → monoco_workflow_agent_reviewer}/SKILL.md +2 -2
  55. monoco/features/agent/resources/{roles/role-engineer.yaml → zh/roles/monoco_role_engineer.yaml} +3 -3
  56. monoco/features/agent/resources/{roles/role-manager.yaml → zh/roles/monoco_role_manager.yaml} +8 -8
  57. monoco/features/agent/resources/{roles/role-planner.yaml → zh/roles/monoco_role_planner.yaml} +8 -8
  58. monoco/features/agent/resources/{roles/role-reviewer.yaml → zh/roles/monoco_role_reviewer.yaml} +8 -8
  59. monoco/{core/resources/zh/skills/monoco_core → features/agent/resources/zh/skills/monoco_atom_core}/SKILL.md +2 -2
  60. monoco/features/agent/resources/zh/skills/{flow_engineer → monoco_workflow_agent_engineer}/SKILL.md +2 -2
  61. monoco/features/agent/resources/zh/skills/{flow_manager → monoco_workflow_agent_manager}/SKILL.md +2 -2
  62. monoco/features/agent/resources/zh/skills/{flow_planner → monoco_workflow_agent_planner}/SKILL.md +2 -2
  63. monoco/features/agent/resources/zh/skills/{flow_reviewer → monoco_workflow_agent_reviewer}/SKILL.md +2 -2
  64. monoco/features/agent/worker.py +1 -1
  65. monoco/features/artifact/__init__.py +0 -0
  66. monoco/features/artifact/adapter.py +33 -0
  67. monoco/features/artifact/resources/zh/AGENTS.md +14 -0
  68. monoco/features/artifact/resources/zh/skills/monoco_atom_artifact/SKILL.md +278 -0
  69. monoco/features/glossary/adapter.py +18 -7
  70. monoco/features/glossary/resources/en/skills/{monoco_glossary → monoco_atom_glossary}/SKILL.md +2 -2
  71. monoco/features/glossary/resources/zh/skills/{monoco_glossary → monoco_atom_glossary}/SKILL.md +2 -2
  72. monoco/features/hooks/__init__.py +11 -0
  73. monoco/features/hooks/adapter.py +67 -0
  74. monoco/features/hooks/commands.py +309 -0
  75. monoco/features/hooks/core.py +441 -0
  76. monoco/features/hooks/resources/ADDING_HOOKS.md +234 -0
  77. monoco/features/i18n/adapter.py +18 -5
  78. monoco/features/i18n/core.py +482 -17
  79. monoco/features/i18n/resources/en/skills/{monoco_i18n → monoco_atom_i18n}/SKILL.md +2 -2
  80. monoco/features/i18n/resources/en/skills/{i18n_scan_workflow → monoco_workflow_i18n_scan}/SKILL.md +2 -2
  81. monoco/features/i18n/resources/zh/skills/{monoco_i18n → monoco_atom_i18n}/SKILL.md +2 -2
  82. monoco/features/i18n/resources/zh/skills/{i18n_scan_workflow → monoco_workflow_i18n_scan}/SKILL.md +2 -2
  83. monoco/features/issue/adapter.py +19 -6
  84. monoco/features/issue/commands.py +352 -20
  85. monoco/features/issue/core.py +475 -16
  86. monoco/features/issue/engine/machine.py +114 -4
  87. monoco/features/issue/linter.py +60 -5
  88. monoco/features/issue/models.py +2 -2
  89. monoco/features/issue/resources/en/AGENTS.md +109 -0
  90. monoco/features/issue/resources/en/skills/{monoco_issue → monoco_atom_issue}/SKILL.md +2 -2
  91. monoco/features/issue/resources/en/skills/{issue_create_workflow → monoco_workflow_issue_creation}/SKILL.md +2 -2
  92. monoco/features/issue/resources/en/skills/{issue_develop_workflow → monoco_workflow_issue_development}/SKILL.md +2 -2
  93. monoco/features/issue/resources/en/skills/{issue_lifecycle_workflow → monoco_workflow_issue_management}/SKILL.md +2 -2
  94. monoco/features/issue/resources/en/skills/{issue_refine_workflow → monoco_workflow_issue_refinement}/SKILL.md +2 -2
  95. monoco/features/issue/resources/hooks/post-checkout.sh +39 -0
  96. monoco/features/issue/resources/hooks/pre-commit.sh +41 -0
  97. monoco/features/issue/resources/hooks/pre-push.sh +35 -0
  98. monoco/features/issue/resources/zh/AGENTS.md +109 -0
  99. monoco/features/issue/resources/zh/skills/{monoco_issue → monoco_atom_issue_lifecycle}/SKILL.md +2 -2
  100. monoco/features/issue/resources/zh/skills/{issue_create_workflow → monoco_workflow_issue_creation}/SKILL.md +2 -2
  101. monoco/features/issue/resources/zh/skills/{issue_develop_workflow → monoco_workflow_issue_development}/SKILL.md +2 -2
  102. monoco/features/issue/resources/zh/skills/{issue_lifecycle_workflow → monoco_workflow_issue_management}/SKILL.md +2 -2
  103. monoco/features/issue/resources/zh/skills/{issue_refine_workflow → monoco_workflow_issue_refinement}/SKILL.md +2 -2
  104. monoco/features/issue/validator.py +101 -1
  105. monoco/features/memo/adapter.py +21 -8
  106. monoco/features/memo/cli.py +103 -10
  107. monoco/features/memo/core.py +178 -92
  108. monoco/features/memo/models.py +53 -0
  109. monoco/features/memo/resources/en/skills/{monoco_memo → monoco_atom_memo}/SKILL.md +2 -2
  110. monoco/features/memo/resources/en/skills/{note_processing_workflow → monoco_workflow_note_processing}/SKILL.md +2 -2
  111. monoco/features/memo/resources/zh/skills/{monoco_memo → monoco_atom_memo}/SKILL.md +2 -2
  112. monoco/features/memo/resources/zh/skills/{note_processing_workflow → monoco_workflow_note_processing}/SKILL.md +2 -2
  113. monoco/features/spike/adapter.py +18 -5
  114. monoco/features/spike/commands.py +5 -3
  115. monoco/features/spike/resources/en/skills/{monoco_spike → monoco_atom_spike}/SKILL.md +2 -2
  116. monoco/features/spike/resources/en/skills/{research_workflow → monoco_workflow_research}/SKILL.md +2 -2
  117. monoco/features/spike/resources/zh/skills/{monoco_spike → monoco_atom_spike}/SKILL.md +2 -2
  118. monoco/features/spike/resources/zh/skills/{research_workflow → monoco_workflow_research}/SKILL.md +2 -2
  119. monoco/main.py +38 -1
  120. {monoco_toolkit-0.3.10.dist-info → monoco_toolkit-0.3.12.dist-info}/METADATA +7 -1
  121. monoco_toolkit-0.3.12.dist-info/RECORD +202 -0
  122. monoco/features/agent/apoptosis.py +0 -44
  123. monoco/features/agent/manager.py +0 -91
  124. monoco/features/agent/session.py +0 -121
  125. monoco_toolkit-0.3.10.dist-info/RECORD +0 -156
  126. /monoco/{core → features/agent}/resources/en/AGENTS.md +0 -0
  127. /monoco/{core → features/agent}/resources/zh/AGENTS.md +0 -0
  128. {monoco_toolkit-0.3.10.dist-info → monoco_toolkit-0.3.12.dist-info}/WHEEL +0 -0
  129. {monoco_toolkit-0.3.10.dist-info → monoco_toolkit-0.3.12.dist-info}/entry_points.txt +0 -0
  130. {monoco_toolkit-0.3.10.dist-info → monoco_toolkit-0.3.12.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,20 @@
1
+ """
2
+ Monoco Mailroom - Automated Ingestion System
3
+
4
+ Provides environment discovery, file watching, and automated conversion
5
+ for document ingestion into the Monoco Artifact System.
6
+ """
7
+
8
+ from .discovery import EnvironmentDiscovery, ConversionTool
9
+ from .worker import ConversionWorker, ConversionTask, ConversionResult
10
+ from .watcher import DropzoneWatcher, IngestionEvent
11
+
12
+ __all__ = [
13
+ "EnvironmentDiscovery",
14
+ "ConversionTool",
15
+ "ConversionWorker",
16
+ "ConversionTask",
17
+ "ConversionResult",
18
+ "DropzoneWatcher",
19
+ "IngestionEvent",
20
+ ]
@@ -0,0 +1,248 @@
1
+ """
2
+ Environment Discovery Module for Monoco Mailroom.
3
+
4
+ Automatically detects available document conversion tools in the system,
5
+ including LibreOffice (soffice), Pandoc, and PDF processing engines.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import shutil
11
+ import subprocess
12
+ from dataclasses import dataclass, field
13
+ from enum import Enum
14
+ from pathlib import Path
15
+ from typing import Optional
16
+
17
+
18
class ToolType(str, Enum):
    """Families of external conversion programs known to the discovery layer.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    LIBREOFFICE = "libreoffice"
    PANDOC = "pandoc"
    PDF2TEXT = "pdf2text"    # assigned to the ``pdftotext`` executable
    PDFTOHTML = "pdftohtml"
    CUSTOM = "custom"
25
+
26
+
27
class ToolCapability(str, Enum):
    """Individual source-to-target conversions a tool may advertise.

    Each member is named ``<SOURCE>_TO_<TARGET>`` and, being a ``str``
    subclass, compares equal to its lowercase string value.
    """

    # Word-processing documents
    DOCX_TO_TEXT = "docx_to_text"
    DOCX_TO_MD = "docx_to_md"

    # PDF documents
    PDF_TO_TEXT = "pdf_to_text"
    PDF_TO_HTML = "pdf_to_html"

    # Other office formats
    ODT_TO_TEXT = "odt_to_text"
    XLSX_TO_CSV = "xlsx_to_csv"
    PPTX_TO_TEXT = "pptx_to_text"
36
+
37
+
38
@dataclass
class ConversionTool:
    """Record describing one conversion executable found on this machine.

    Attributes:
        name: Display name of the tool (e.g. "Pandoc").
        tool_type: Tool family the executable belongs to.
        executable_path: Filesystem location of the executable.
        version: Version text reported by the tool, or "unknown".
        capabilities: Conversions this tool can perform.
        priority: Ranking weight; larger values are preferred.
    """

    name: str
    tool_type: ToolType
    executable_path: Path
    version: str = "unknown"
    capabilities: list[ToolCapability] = field(default_factory=list)
    priority: int = 0  # Higher = preferred

    def is_available(self) -> bool:
        """Return True when the path is a regular file the current user may execute."""
        # Imported locally so this method does not depend on where (or whether)
        # the module-level ``import os`` appears in the file.
        import os

        path = self.executable_path
        # is_file() instead of exists(): a directory usually carries the execute
        # bit too, and must not be reported as a runnable tool.
        return path.is_file() and os.access(path, os.X_OK)
51
+
52
+
53
class EnvironmentDiscovery:
    """
    Discovers and manages document conversion tools in the system.

    Automatically detects:
    - LibreOffice (soffice) for Office document conversion
    - Pandoc for markdown/text conversion
    - PDF utilities (pdftotext, pdftohtml)

    Results are cached after the first ``discover()`` call; pass
    ``force=True`` to probe the system again.
    """

    # Known executable names to search for
    LIBREOFFICE_BINARIES = ["soffice", "libreoffice", "soffice.bin"]
    PANDOC_BINARIES = ["pandoc"]
    # NOTE(review): not referenced by any method visible in this class; the PDF
    # discovery below looks up "pdftotext" and "pdftohtml" by literal name.
    PDF_TOOLS = ["pdftotext", "pdftohtml", "pdf2txt.py"]

    def __init__(self) -> None:
        """Create an empty registry; nothing is probed until ``discover()`` runs."""
        self._tools: dict[ToolType, list[ConversionTool]] = {}
        self._discovered = False

    def discover(self, force: bool = False) -> dict[ToolType, list[ConversionTool]]:
        """
        Discover all available conversion tools.

        Args:
            force: Force re-discovery even if already done

        Returns:
            Dictionary mapping tool types to lists of discovered tools
        """
        if self._discovered and not force:
            return self._tools

        # NOTE(review): both PDF tools are filed under the PDF2TEXT key, so a
        # pdftohtml entry (tool_type PDFTOHTML) lives under a key that differs
        # from its own tool_type. Kept unchanged for backward compatibility.
        self._tools = {
            ToolType.LIBREOFFICE: self._discover_libreoffice(),
            ToolType.PANDOC: self._discover_pandoc(),
            ToolType.PDF2TEXT: self._discover_pdf_tools(),
        }

        self._discovered = True
        return self._tools

    def _find_executable(self, names: list[str]) -> Optional[Path]:
        """Return the resolved path of the first name found on PATH, else None."""
        for name in names:
            located = shutil.which(name)
            if located:
                return Path(located).resolve()
        return None

    def _get_version(self, executable: Path, version_arg: str = "--version") -> str:
        """
        Run ``executable version_arg`` and return the first line it prints.

        Args:
            executable: Program to query
            version_arg: Flag that makes the program print its version

        Returns:
            First line of stdout (or stderr when stdout is empty), or
            "unknown" when the program cannot be run, times out, or prints
            nothing but whitespace
        """
        try:
            result = subprocess.run(
                [str(executable), version_arg],
                capture_output=True,
                text=True,
                timeout=5,
                check=False,
            )
        except (subprocess.TimeoutExpired, OSError, ValueError):
            return "unknown"

        # Some tools print their version banner on stderr instead of stdout.
        output = result.stdout or result.stderr
        # splitlines() yields nothing for blank output (so we fall back to
        # "unknown" instead of returning "") and also handles CRLF endings.
        lines = output.strip().splitlines() if output else []
        return lines[0] if lines else "unknown"

    def _discover_libreoffice(self) -> list[ConversionTool]:
        """Return a one-element list describing LibreOffice, or [] if absent."""
        executable = self._find_executable(self.LIBREOFFICE_BINARIES)
        if not executable:
            return []

        return [
            ConversionTool(
                name="LibreOffice",
                tool_type=ToolType.LIBREOFFICE,
                executable_path=executable,
                version=self._get_version(executable),
                capabilities=[
                    ToolCapability.DOCX_TO_TEXT,
                    ToolCapability.DOCX_TO_MD,
                    ToolCapability.ODT_TO_TEXT,
                    ToolCapability.XLSX_TO_CSV,
                    ToolCapability.PPTX_TO_TEXT,
                ],
                priority=100,  # High priority for Office docs
            )
        ]

    def _discover_pandoc(self) -> list[ConversionTool]:
        """Return a one-element list describing Pandoc, or [] if absent."""
        executable = self._find_executable(self.PANDOC_BINARIES)
        if not executable:
            return []

        return [
            ConversionTool(
                name="Pandoc",
                tool_type=ToolType.PANDOC,
                executable_path=executable,
                version=self._get_version(executable),
                capabilities=[
                    ToolCapability.DOCX_TO_MD,
                    ToolCapability.DOCX_TO_TEXT,
                    ToolCapability.ODT_TO_TEXT,
                ],
                priority=90,
            )
        ]

    def _discover_pdf_tools(self) -> list[ConversionTool]:
        """Return entries for whichever of pdftotext / pdftohtml are installed."""
        tools = []

        # pdftotext (from poppler-utils)
        pdftotext = self._find_executable(["pdftotext"])
        if pdftotext:
            tools.append(ConversionTool(
                name="pdftotext",
                tool_type=ToolType.PDF2TEXT,
                executable_path=pdftotext,
                version=self._get_version(pdftotext, "-v"),
                capabilities=[ToolCapability.PDF_TO_TEXT],
                priority=100,
            ))

        # pdftohtml
        pdftohtml = self._find_executable(["pdftohtml"])
        if pdftohtml:
            tools.append(ConversionTool(
                name="pdftohtml",
                tool_type=ToolType.PDFTOHTML,
                executable_path=pdftohtml,
                version=self._get_version(pdftohtml, "-v"),
                capabilities=[ToolCapability.PDF_TO_HTML],
                priority=80,
            ))

        return tools

    def get_best_tool(self, capability: ToolCapability) -> Optional[ConversionTool]:
        """
        Get the best available tool for a specific capability.

        Args:
            capability: The required conversion capability

        Returns:
            The matching ConversionTool with the highest priority (the first
            one discovered wins a tie), or None when no tool matches
        """
        candidates = [
            tool for tool in self.get_all_tools()
            if capability in tool.capabilities
        ]
        # max() keeps the first of several equal-priority tools, matching a
        # stable descending sort followed by taking element 0.
        return max(candidates, key=lambda tool: tool.priority, default=None)

    def get_all_tools(self) -> list[ConversionTool]:
        """Return every discovered tool as one flat list, discovering first if needed."""
        if not self._discovered:
            self.discover()

        return [tool for tool_list in self._tools.values() for tool in tool_list]

    def has_capability(self, capability: ToolCapability) -> bool:
        """Check if any tool supports the given capability."""
        return self.get_best_tool(capability) is not None

    def get_capabilities_summary(self) -> dict[str, bool]:
        """Map every ToolCapability value to whether some discovered tool provides it."""
        return {
            cap.value: self.has_capability(cap)
            for cap in ToolCapability
        }
245
+
246
+
247
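+ # NOTE: `os` is imported at the end of the module; it is only looked up at call time (ConversionTool.is_available), so the late import still works.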
248
+ import os
@@ -0,0 +1,343 @@
1
+ """
2
+ Dropzone Watcher for Monoco Mailroom.
3
+
4
+ Monitors dropzone directories for new files and triggers
5
+ automated ingestion workflows.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import asyncio
11
+ import logging
12
+ import uuid
13
+ from dataclasses import dataclass, field
14
+ from datetime import datetime, timezone
15
+ from enum import Enum
16
+ from pathlib import Path
17
+ from typing import Optional, Callable, Any, Set
18
+
19
+ from watchdog.observers import Observer
20
+ from watchdog.events import FileSystemEventHandler, FileCreatedEvent, FileMovedEvent
21
+
22
+ from .worker import ConversionWorker, ConversionTask, ConversionResult, ConversionStatus
23
+ from ..artifacts.manager import ArtifactManager
24
+ from ..artifacts.models import ArtifactSourceType
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
class IngestionEventType(str, Enum):
    """Stages reported while a dropped file moves through ingestion.

    Members are ``str`` subclasses and compare equal to their string values.
    """

    FILE_DETECTED = "file_detected"

    # Conversion stage
    CONVERSION_STARTED = "conversion_started"
    CONVERSION_COMPLETED = "conversion_completed"
    CONVERSION_FAILED = "conversion_failed"

    # Registration stage
    ARTIFACT_REGISTERED = "artifact_registered"
36
+
37
+
38
@dataclass
class IngestionEvent:
    """Notification describing one step of ingesting a single file.

    Only ``event_type`` and ``file_path`` are required; the remaining fields
    default to ``None`` / an empty dict, and ``timestamp`` defaults to the
    UTC time at which the instance is created.
    """

    # Required fields
    event_type: IngestionEventType
    file_path: Path

    # Optional correlation / diagnostic fields
    task_id: Optional[str] = None
    artifact_id: Optional[str] = None
    error_message: Optional[str] = None

    # A fresh dict per instance, never shared between events
    metadata: dict[str, Any] = field(default_factory=dict)

    # Timezone-aware creation time (UTC)
    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
48
+
49
+
50
class DropzoneHandler(FileSystemEventHandler):
    """Watchdog handler that forwards eligible dropzone files to a callback.

    A file is eligible when it is not hidden, does not have a temporary
    suffix, has a supported extension, and has not already been forwarded by
    this handler instance.
    """

    def __init__(
        self,
        dropzone_path: Path,
        on_file_detected: Callable[[Path], None],
        supported_extensions: Optional[Set[str]] = None,
    ):
        """
        Args:
            dropzone_path: Directory being monitored
            on_file_detected: Called with the path of each eligible file
            supported_extensions: Lowercase suffixes (with dot) to accept;
                a built-in office/PDF set is used when omitted or empty
        """
        self.dropzone_path = Path(dropzone_path)
        self.on_file_detected = on_file_detected
        if supported_extensions:
            self.supported_extensions = supported_extensions
        else:
            self.supported_extensions = {
                ".docx", ".doc", ".pdf", ".odt",
                ".xlsx", ".xls", ".pptx", ".ppt",
            }
        # Resolved paths already handed to the callback
        self._processed_files: Set[Path] = set()

    def on_created(self, event):
        """React to a newly created file (directories are ignored)."""
        if not event.is_directory:
            self._forward_if_eligible(Path(event.src_path))

    def on_moved(self, event):
        """React to a file moved into place, using its destination path."""
        if not event.is_directory:
            self._forward_if_eligible(Path(event.dest_path))

    def _forward_if_eligible(self, candidate: Path) -> None:
        """Record the file as processed and invoke the callback if it qualifies."""
        if not self._should_process(candidate):
            return
        # Recorded before the callback runs, exactly as the event handlers did inline
        self._processed_files.add(candidate.resolve())
        self.on_file_detected(candidate)

    def _should_process(self, file_path: Path) -> bool:
        """Return True when the file passes every eligibility check."""
        # Hidden files and temporary files are never processed
        if file_path.name.startswith(".") or file_path.suffix in (".tmp", ".temp", ".part"):
            return False

        # Extension match is case-insensitive
        if file_path.suffix.lower() not in self.supported_extensions:
            return False

        # Finally, skip anything already forwarded
        return file_path.resolve() not in self._processed_files
106
+
107
+
108
class DropzoneWatcher:
    """
    Watches dropzone directories and orchestrates automated ingestion.

    Features:
    - Real-time file system monitoring
    - Automatic conversion using ConversionWorker
    - Artifact registration with ArtifactManager
    - Event callbacks for integration

    File events arrive on the watchdog observer thread, which has no asyncio
    event loop. Processing is therefore scheduled onto the loop captured in
    ``start()`` (or the one passed as ``loop``), using a thread-safe hand-off.
    """

    def __init__(
        self,
        dropzone_path: Path,
        artifact_manager: ArtifactManager,
        conversion_worker: Optional[ConversionWorker] = None,
        output_dir: Optional[Path] = None,
        process_existing: bool = False,
        loop: Optional[asyncio.AbstractEventLoop] = None,
    ):
        """
        Initialize the dropzone watcher.

        Args:
            dropzone_path: Directory to monitor for new files
            artifact_manager: ArtifactManager for registering converted files
            conversion_worker: ConversionWorker for document conversion
            output_dir: Directory for converted files (default: dropzone/converted)
            process_existing: Whether to process files already in dropzone
            loop: Event loop on which files are processed. When omitted, the
                loop running at the time ``start()`` is called is used.
        """
        self.dropzone_path = Path(dropzone_path)
        self.artifact_manager = artifact_manager
        self.conversion_worker = conversion_worker or ConversionWorker()
        self.output_dir = Path(output_dir) if output_dir else (self.dropzone_path / "converted")
        self.process_existing = process_existing

        # Event callbacks
        self._on_event: Optional[Callable[[IngestionEvent], None]] = None

        # State
        self._observer: Optional[Observer] = None
        self._handler: Optional[DropzoneHandler] = None
        self._running = False
        self._explicit_loop = loop
        self._loop: Optional[asyncio.AbstractEventLoop] = loop
        # task_id -> asyncio.Task or concurrent.futures.Future; both offer
        # cancel() and add_done_callback()
        self._pending_tasks: dict[str, Any] = {}

    def set_event_callback(self, callback: Callable[[IngestionEvent], None]) -> None:
        """Set callback for ingestion events."""
        self._on_event = callback

    def _emit_event(self, event: IngestionEvent) -> None:
        """Pass the event to the registered callback, if any.

        Delivery is best-effort: a failing callback is logged and never
        interrupts ingestion.
        """
        if not self._on_event:
            return
        try:
            self._on_event(event)
        except Exception:
            logger.exception("Ingestion event callback raised an exception")

    def start(self) -> None:
        """Start watching the dropzone directory (no-op if already running)."""
        if self._running:
            return

        # Remember the loop that will run file processing.
        if self._explicit_loop is not None:
            self._loop = self._explicit_loop
        else:
            try:
                self._loop = asyncio.get_running_loop()
            except RuntimeError:
                self._loop = None
                logger.warning(
                    "DropzoneWatcher started without a running event loop; "
                    "detected files cannot be processed"
                )

        # Ensure directories exist
        self.dropzone_path.mkdir(parents=True, exist_ok=True)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Set up file system observer
        self._handler = DropzoneHandler(
            self.dropzone_path,
            self._on_file_detected,
            set(self.conversion_worker.get_supported_extensions()),
        )

        self._observer = Observer()
        self._observer.schedule(self._handler, str(self.dropzone_path), recursive=False)
        self._observer.start()

        self._running = True
        logger.info(f"Started watching dropzone: {self.dropzone_path}")

        # Process existing files if requested
        if self.process_existing:
            self._scan_existing_files()

    def stop(self) -> None:
        """Stop watching the dropzone directory and cancel outstanding work."""
        if not self._running:
            return

        self._running = False

        # Stop the observer first so no new work is scheduled while cancelling
        if self._observer:
            self._observer.stop()
            self._observer.join()
            self._observer = None

        # Cancel pending tasks (iterate over a copy: done-callbacks mutate the dict)
        for handle in list(self._pending_tasks.values()):
            handle.cancel()
        self._pending_tasks.clear()

        # Forget an automatically captured loop; a later start() captures afresh
        self._loop = self._explicit_loop

        logger.info(f"Stopped watching dropzone: {self.dropzone_path}")

    def _scan_existing_files(self) -> None:
        """Scan and process files that are already present in the dropzone."""
        handler = self._handler
        if handler is None:
            # start() has not created the handler yet; nothing to filter with
            return

        for file_path in self.dropzone_path.iterdir():
            if file_path.is_file() and handler._should_process(file_path):
                # Mark as processed so a later created/moved event for the
                # same file is not ingested a second time
                handler._processed_files.add(file_path.resolve())
                self._on_file_detected(file_path)

    def _on_file_detected(self, file_path: Path) -> None:
        """Announce a detected file and schedule its processing."""
        logger.info(f"File detected: {file_path}")

        self._emit_event(IngestionEvent(
            event_type=IngestionEventType.FILE_DETECTED,
            file_path=file_path,
        ))

        task_id = str(uuid.uuid4())
        self._schedule_processing(file_path, task_id)

    def _schedule_processing(self, file_path: Path, task_id: str) -> None:
        """Run ``_process_file`` on an event loop, from whichever thread calls this."""
        try:
            running_loop = asyncio.get_running_loop()
        except RuntimeError:
            running_loop = None

        if running_loop is not None:
            # Already on an event-loop thread (e.g. scanning from start())
            handle = running_loop.create_task(self._process_file(file_path, task_id))
        elif self._loop is not None and self._loop.is_running():
            # Called from another thread: hand over thread-safely
            handle = asyncio.run_coroutine_threadsafe(
                self._process_file(file_path, task_id), self._loop
            )
        else:
            logger.error(f"No running event loop to process file: {file_path}")
            self._emit_event(IngestionEvent(
                event_type=IngestionEventType.CONVERSION_FAILED,
                file_path=file_path,
                task_id=task_id,
                error_message="No running event loop available to process file",
            ))
            return

        # Keep a reference so the work is not garbage-collected and can be
        # cancelled by stop(); drop it again once finished
        self._pending_tasks[task_id] = handle
        handle.add_done_callback(
            lambda _finished, key=task_id: self._pending_tasks.pop(key, None)
        )

    async def _process_file(self, file_path: Path, task_id: str) -> None:
        """Process a detected file through the ingestion pipeline.

        Emits CONVERSION_STARTED, then CONVERSION_COMPLETED followed by
        ARTIFACT_REGISTERED on success, or CONVERSION_FAILED on any failure.
        """
        try:
            # Step 1: Check if conversion is needed/possible
            if not self.conversion_worker.can_convert(file_path):
                logger.warning(f"Cannot convert file: {file_path}")
                self._emit_event(IngestionEvent(
                    event_type=IngestionEventType.CONVERSION_FAILED,
                    file_path=file_path,
                    task_id=task_id,
                    error_message="No conversion tool available for this file type",
                ))
                return

            # Step 2: Create conversion task
            conversion_task = ConversionTask(
                task_id=task_id,
                source_path=file_path,
                target_format="txt",
                output_dir=self.output_dir,
            )

            self._emit_event(IngestionEvent(
                event_type=IngestionEventType.CONVERSION_STARTED,
                file_path=file_path,
                task_id=task_id,
            ))

            # Step 3: Perform conversion
            result = await self.conversion_worker.submit(conversion_task)

            if result.status != ConversionStatus.SUCCESS:
                logger.error(f"Conversion failed for {file_path}: {result.error_message}")
                self._emit_event(IngestionEvent(
                    event_type=IngestionEventType.CONVERSION_FAILED,
                    file_path=file_path,
                    task_id=task_id,
                    error_message=result.error_message,
                ))
                return

            self._emit_event(IngestionEvent(
                event_type=IngestionEventType.CONVERSION_COMPLETED,
                file_path=file_path,
                task_id=task_id,
                metadata={
                    "output_path": str(result.output_path),
                    "processing_time_ms": result.processing_time_ms,
                },
            ))

            # Step 4: Register as artifact
            if result.output_path and result.output_path.exists():
                artifact_meta = self._register_artifact(
                    result.output_path,
                    source_file=file_path,
                    conversion_metadata=result.metadata,
                )

                self._emit_event(IngestionEvent(
                    event_type=IngestionEventType.ARTIFACT_REGISTERED,
                    file_path=file_path,
                    task_id=task_id,
                    artifact_id=artifact_meta.artifact_id,
                    metadata={
                        "content_hash": artifact_meta.content_hash,
                        "content_type": artifact_meta.content_type,
                    },
                ))

                logger.info(f"Successfully ingested {file_path} as artifact {artifact_meta.artifact_id}")
            else:
                # Conversion reported success but produced no file to register
                logger.warning(f"Converted output missing for {file_path}; artifact not registered")

        except Exception as e:
            logger.exception(f"Error processing file {file_path}")
            self._emit_event(IngestionEvent(
                event_type=IngestionEventType.CONVERSION_FAILED,
                file_path=file_path,
                task_id=task_id,
                error_message=str(e),
            ))

    def _register_artifact(
        self,
        file_path: Path,
        source_file: Path,
        conversion_metadata: dict[str, Any],
    ) -> Any:
        """Store the converted file through the artifact manager.

        Args:
            file_path: Converted output file to store
            source_file: Original file that was dropped
            conversion_metadata: Extra metadata from the conversion; its keys
                override ``source_file`` / ``original_filename`` on collision

        Returns:
            Whatever ``artifact_manager.store_file`` returns
        """
        metadata = {
            "source_file": str(source_file),
            "original_filename": source_file.name,
            **conversion_metadata,
        }

        return self.artifact_manager.store_file(
            file_path=file_path,
            source_type=ArtifactSourceType.IMPORTED,
            content_type="text/plain",
            tags=["mailroom", "converted", source_file.suffix.lower().lstrip(".")],
            metadata=metadata,
        )

    def is_running(self) -> bool:
        """Check if the watcher is currently running."""
        return self._running

    def get_stats(self) -> dict[str, Any]:
        """Return a snapshot of watcher state: paths, running flag, pending count, extensions."""
        return {
            "running": self._running,
            "dropzone_path": str(self.dropzone_path),
            "output_dir": str(self.output_dir),
            "pending_tasks": len(self._pending_tasks),
            "supported_extensions": list(self.conversion_worker.get_supported_extensions()),
        }