agents-lab 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: agents-lab
3
+ Version: 0.1.0
4
+ Summary: LangChain LLM agents built on top of tokens-lab.
5
+ Author-email: Mohamed Moslemani <moslemanomohamed@gmail.com>, Youssef Moussallem <youssefaamoussallem@hotmail.com>
6
+ License-Expression: MIT
7
+ Keywords: llm,agents,langchain,langgraph
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Typing :: Typed
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: tokens-lab>=0.5.0
14
+ Requires-Dist: langgraph>=0.2.0
15
+ Provides-Extra: dev
16
+ Requires-Dist: pytest>=7.0; extra == "dev"
17
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
18
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
19
+ Requires-Dist: mypy>=1.8; extra == "dev"
@@ -0,0 +1,48 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "agents-lab"
7
+ version = "0.1.0"
8
+ description = "LangChain LLM agents built on top of tokens-lab."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+
13
+ authors = [
14
+ { name = "Mohamed Moslemani", email = "moslemanomohamed@gmail.com" },
15
+ { name = "Youssef Moussallem", email = "youssefaamoussallem@hotmail.com" }
16
+ ]
17
+
18
+ keywords = ["llm", "agents", "langchain", "langgraph"]
19
+
20
+ classifiers = [
21
+ "Programming Language :: Python :: 3",
22
+ "Typing :: Typed",
23
+ "Operating System :: OS Independent",
24
+ ]
25
+
26
+ dependencies = [
27
+ "tokens-lab>=0.5.0",
28
+ "langgraph>=0.2.0",
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ dev = [
33
+ "pytest>=7.0",
34
+ "pytest-cov>=4.0",
35
+ "ruff>=0.4.0",
36
+ "mypy>=1.8",
37
+ ]
38
+
39
+ [tool.setuptools]
40
+ package-dir = { "" = "src" }
41
+
42
+ [tool.setuptools.packages.find]
43
+ where = ["src"]
44
+ include = ["llm_agents*"]
45
+ namespaces = false
46
+
47
+ [tool.setuptools.package-data]
48
+ "llm_agents" = ["py.typed"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: agents-lab
3
+ Version: 0.1.0
4
+ Summary: LangChain LLM agents built on top of tokens-lab.
5
+ Author-email: Mohamed Moslemani <moslemanomohamed@gmail.com>, Youssef Moussallem <youssefaamoussallem@hotmail.com>
6
+ License-Expression: MIT
7
+ Keywords: llm,agents,langchain,langgraph
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Typing :: Typed
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: tokens-lab>=0.5.0
14
+ Requires-Dist: langgraph>=0.2.0
15
+ Provides-Extra: dev
16
+ Requires-Dist: pytest>=7.0; extra == "dev"
17
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
18
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
19
+ Requires-Dist: mypy>=1.8; extra == "dev"
@@ -0,0 +1,16 @@
1
+ pyproject.toml
2
+ src/agents_lab.egg-info/PKG-INFO
3
+ src/agents_lab.egg-info/SOURCES.txt
4
+ src/agents_lab.egg-info/dependency_links.txt
5
+ src/agents_lab.egg-info/requires.txt
6
+ src/agents_lab.egg-info/top_level.txt
7
+ src/llm_agents/__init__.py
8
+ src/llm_agents/invoice_parsing_agent.py
9
+ src/llm_agents/name_consolidator.py
10
+ src/llm_agents/name_mapper.py
11
+ src/llm_agents/name_mapper_ml.py
12
+ src/llm_agents/py.typed
13
+ src/llm_agents/resume_slide_identifier.py
14
+ src/llm_agents/resume_structurer.py
15
+ src/llm_agents/resumes_consolidator.py
16
+ src/llm_agents/states.py
@@ -0,0 +1,8 @@
1
+ tokens-lab>=0.5.0
2
+ langgraph>=0.2.0
3
+
4
+ [dev]
5
+ pytest>=7.0
6
+ pytest-cov>=4.0
7
+ ruff>=0.4.0
8
+ mypy>=1.8
@@ -0,0 +1 @@
1
+ llm_agents
@@ -0,0 +1,58 @@
1
+ """LLM Agents — LangChain/LangGraph agents built on tokens-lab."""
2
+
3
+ from __future__ import annotations
4
+
5
+ __version__ = "0.1.0"
6
+
7
+
8
+ def __getattr__(name: str):
9
+ """Lazy-load public symbols on first access."""
10
+ _imports: dict[str, tuple[str, str]] = {
11
+ # agents
12
+ "InvoiceParsingAgent": (".invoice_parsing_agent", "InvoiceParsingAgent"),
13
+ "ResumeSlideIdentifier": (".resume_slide_identifier", "ResumeSlideIdentifier"),
14
+ "ResumeStructurer": (".resume_structurer", "ResumeStructurer"),
15
+ "ResumesConsolidator": (".resumes_consolidator", "ResumesConsolidator"),
16
+ "NameMapper": (".name_mapper", "NameMapper"),
17
+ "NameConsolidator": (".name_consolidator", "NameConsolidator"),
18
+ "auto_match_names": (".name_mapper_ml", "auto_match_names"),
19
+ "extract_name": (".name_mapper_ml", "extract_name"),
20
+ # states
21
+ "BaseState": (".states", "BaseState"),
22
+ "InvoiceState": (".states", "InvoiceState"),
23
+ "NameMatchingState": (".states", "NameMatchingState"),
24
+ "ResumeState": (".states", "ResumeState"),
25
+ "CVParserState": (".states", "CVParserState"),
26
+ "ConsolidateState": (".states", "ConsolidateState"),
27
+ "DedupState": (".states", "DedupState"),
28
+ }
29
+
30
+ if name in _imports:
31
+ module_path, attr = _imports[name]
32
+ import importlib
33
+ mod = importlib.import_module(module_path, __name__)
34
+ return getattr(mod, attr)
35
+
36
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
37
+
38
+
39
+ __all__ = [
40
+ "__version__",
41
+ # agents
42
+ "InvoiceParsingAgent",
43
+ "ResumeSlideIdentifier",
44
+ "ResumeStructurer",
45
+ "ResumesConsolidator",
46
+ "NameMapper",
47
+ "NameConsolidator",
48
+ "auto_match_names",
49
+ "extract_name",
50
+ # states
51
+ "BaseState",
52
+ "InvoiceState",
53
+ "NameMatchingState",
54
+ "ResumeState",
55
+ "CVParserState",
56
+ "ConsolidateState",
57
+ "DedupState",
58
+ ]
@@ -0,0 +1,284 @@
1
+ """Invoice processing agent using LangGraph.
2
+
3
+ Handles multi-step processing of invoices from PDFs and text files.
4
+ Goes through: load document → clean text → extract with LLM → parse JSON → validate.
5
+ Retries automatically when errors happen.
6
+
7
+ Steps:
8
+ - Load document (text + PDF images)
9
+ - Extract invoice data using LLM
10
+ - Parse the output into JSON
11
+ - Validate the data
12
+ - Retry if there are errors (up to configured max)
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import threading
19
+ from typing import Any, Callable
20
+
21
+ from langgraph.graph import END
22
+
23
+ from llm_lab.agent import Agent, LLMClient
24
+ from llm_lab.filetextio import load_text, Pdf
25
+ from llm_lab.utils import _clean_llm_json_output
26
+ from llm_lab.preprocessing import normalize_text
27
+
28
+ from .states import InvoiceState
29
+
30
# Serializes PDF-to-image extraction across threads: node_ingest acquires this
# lock around Pdf.pdf_to_base64_images.
# NOTE(review): presumably the PDF rasterization backend is not thread-safe —
# confirm against the Pdf implementation in llm_lab.filetextio.
_pdf_image_lock = threading.Lock()

# Base output-contract rules. InvoiceParsingAgent.__init__ prepends these to
# every caller-supplied system prompt, so domain prompts only need
# extraction rules, not formatting rules.
_BASE_SYSTEM_RULES = """
==================================================
OUTPUT FORMAT (STRICT — ENFORCED BY AGENT)
==================================================
- Output ONE JSON object ONLY
- NO markdown, NO code fences, NO comments, NO explanations
- Use null for missing or uncertain values
- Keys MUST match the target field names EXACTLY
- Extract VALUES ONLY (no labels, no prefixes)

==================================================
VALUE FORMATTING RULES (ENFORCED BY AGENT)
==================================================
- Monetary values: number only (no currency symbols, no words)
- Preserve original numeric formatting (commas/decimals)
- If a value is not explicitly printed → null
""".strip()
48
+
49
class InvoiceParsingAgent(Agent):
    """Agent for invoice extraction and validation.

    Workflow: load → clean → extract → parse → validate → retry if needed.

    The agent enforces base output format rules (JSON, null handling) automatically.
    Your system_prompt should contain domain-specific extraction rules only — the
    base rules are prepended by the agent.

    Args:
        client: LLMClient instance for LLM calls.
        devsettings: Retry limits and timeouts.
        logger: Logger instance.
        error_handler: Class with _ensure_defaults(state) and _set_error(msg, exc) static methods.
        validate_fn: Callable(parsed_json, state) -> (is_valid, validated, updates).
        arabic_instruction: String appended to user prompts for Arabic text handling.
        system_prompt: Domain-specific extraction rules (output format rules are added automatically).
    """

    def __init__(
        self,
        client: LLMClient,
        devsettings: Any,
        logger: Any,
        error_handler: Any,
        validate_fn: Callable,
        arabic_instruction: str,
        system_prompt: str,
    ) -> None:
        super().__init__(InvoiceState)

        self._client = client
        self.devsettings = devsettings
        self._logger = logger
        self._error_handler = error_handler
        self._validate_fn = validate_fn
        self._arabic_instruction = arabic_instruction
        # The base output-contract rules always precede the caller's
        # domain-specific rules in the final system prompt.
        self._system_prompt = f"{_BASE_SYSTEM_RULES}\n\n{system_prompt}"
        self.setup_graph()

    def setup_graph(self) -> None:
        """Wire the LangGraph pipeline: a linear chain of guarded edges with a
        conditional retry loop (validate → increment_retry → extract)."""
        self.add_node("ingest", self.node_ingest)
        self.add_node("preprocess", self.node_preprocess)
        self.add_node("extract", self.node_extract)
        self.add_node("parse_json", self.node_parse_json)
        self.add_node("validate", self.node_validate)
        self.add_node("increment_retry", self.node_increment_retry)

        self.set_entry_point("ingest")

        # _guarded / _retry_edge are inherited from Agent.
        # NOTE(review): presumably _guarded routes to END when stop_agent is
        # set — confirm against the Agent base class.
        self._guarded("ingest", "preprocess")
        self._guarded("preprocess", "extract")
        self._guarded("extract", "parse_json")
        self._guarded("parse_json", "validate")
        self._retry_edge("validate", "increment_retry", self.should_retry)
        self._guarded("increment_retry", "extract")

        self.compile()

        self._logger.info(f"InvoiceAgent initialized with model: {self._client.model_name}")

    def node_ingest(self, state: InvoiceState) -> InvoiceState:
        """Load the document: raw text plus base64 PDF page images.

        Text extraction and image extraction may fail independently; the node
        only aborts (stop_agent) when *neither* produced any usable content.
        """
        updates = self._error_handler._ensure_defaults(state)
        path = state.get("path")
        self._logger.info(f"Starting document ingestion: {path}")

        if not path:
            return {**updates, "stop_agent": True, "error": "Missing document path"}

        raw_text = ""
        pdf_images = []
        try:
            raw_text = load_text(path) or ""
        except Exception as exc:
            # Text failure is non-fatal here: page images alone may suffice.
            err_update = self._error_handler._set_error("Could not read text content", exc)
            updates.update(err_update)
            raw_text = ""

        try:
            # Module-level lock serializes rasterization across threads.
            with _pdf_image_lock:
                extracted = Pdf.pdf_to_base64_images(path, max_pages=self.devsettings.max_pages)
            pdf_images = extracted or []
            if not pdf_images:
                self._logger.warning(f"No images extracted from PDF: {path}")
        except Exception as exc:
            self._logger.error(f"PDF image extraction failed for {path}: {str(exc)}", exc_info=True)
            err_update = self._error_handler._set_error("Could not process PDF content", exc)
            updates.update(err_update)
            pdf_images = []

        # An image counts as content only if it is a non-blank base64 string.
        has_images_content = any(isinstance(img, str) and img.strip() for img in pdf_images)
        if not (raw_text.strip() or has_images_content):
            self._logger.error("No content extracted from document")
            return {**updates, "stop_agent": True, "error": updates.get("error", state.get("error", ""))}

        self._logger.info(f"Document ingested successfully: {len(raw_text)} chars, {len(pdf_images)} images")
        # Success clears "error"; any partial-failure message is demoted to
        # "last_error" so downstream nodes can still see it.
        return {**updates, "raw_text": raw_text, "pdf_images": pdf_images, "retry_count": 0, "error": "", "last_error": updates.get("error", state.get("error", ""))}

    def node_preprocess(self, state: InvoiceState) -> InvoiceState:
        """Normalize the raw text; a normalization failure is fatal (stop_agent)."""
        raw_text = state.get("raw_text", "")

        try:
            clean_text = normalize_text(raw_text)
        except Exception as exc:
            return {**state, "stop_agent": True, "error": f"Text normalization failed: {str(exc)}"}

        # On success the current error is demoted to last_error.
        return {"clean_text": clean_text, "error": "", "last_error": state.get("error", "")}

    def node_extract(self, state: InvoiceState) -> InvoiceState:
        """Call the LLM to extract invoice fields.

        On retries, the previous validation feedback is folded into the user
        prompt so the model can fix the specific issues. When page images are
        available they are sent alongside the extracted text (multimodal);
        otherwise a text-only message is used.
        """
        try:
            user_prompt = state.get("user_prompt", "")
            feedback = state.get("validation_feedback", "")
            pdf_images = state.get("pdf_images", [])
            clean_text = state.get("clean_text", "")

            if feedback:
                user_prompt = (
                    f"{user_prompt}{self._arabic_instruction}\n"
                    f"PREVIOUS EXTRACTION HAD ISSUES:\n{feedback}\n\n"
                    "Please re-extract and fix the validation issues above while strictly following the Arabic text preservation rules."
                )
            else:
                user_prompt = f"{user_prompt}{self._arabic_instruction}"

            # Build messages with optional image support
            messages = [{"role": "system", "content": self._system_prompt}]

            if pdf_images:
                content_items = [
                    {
                        "type": "input_text",
                        "text": (
                            user_prompt
                            + ", analyze the invoice images and extracted text below. "
                            "Use the visual layout across pages, tables, and formatting to accurately identify and extract fields.\n\n"
                            "Extracted text:\n" + clean_text
                        ),
                    }
                ]
                # One image item per page, inlined as a data URL.
                for img_b64 in pdf_images:
                    content_items.append({
                        "type": "input_image",
                        "image_url": f"data:image/jpeg;base64,{img_b64}",
                    })
                messages.append({"role": "user", "content": content_items})
            else:
                messages.append({
                    "role": "user",
                    "content": user_prompt + ", find the current invoice text below:\n\n" + clean_text,
                })

            self._logger.info("Calling Invoice extraction: %s", self._client.model_name)
            result_content = self._client.generate_response(messages)
            self._logger.info("Invoice extraction response received: %d chars", len(result_content) if result_content else 0)

            if result_content is None:
                self._logger.error("LLM extraction returned None")
                return {"llm_output": "", "error": "Could not extract invoice data."}

            self._logger.info(f"LLM extraction successful: {len(result_content)} chars")

            return {"llm_output": result_content, "error": "", "last_error": state.get("error", "")}

        except Exception as exc:
            return {"llm_output": "", "error": f"Failed to send LLM request: {str(exc)}", "last_error": state.get("error", "")}

    def node_parse_json(self, state: InvoiceState) -> InvoiceState:
        """Strip fences/noise from the LLM output and parse it as JSON.

        Non-dict JSON (e.g. a bare list) is wrapped under the "_root" key so
        downstream validation always receives a dict.
        """
        llm_output = state.get("llm_output") or ""

        try:
            cleaned_output = _clean_llm_json_output(llm_output, self._logger)
            parsed = json.loads(cleaned_output)
            return {"parsed_json": parsed if isinstance(parsed, dict) else {"_root": parsed}, "error": "", "last_error": state.get("error", "")}
        except json.JSONDecodeError as e:
            return {"parsed_json": {}, "error": f"Invalid JSON: {str(e)}", "last_error": state.get("error", "")}
        except Exception as e:
            return {"parsed_json": {}, "error": f"Failed to parse JSON: {str(e)}", "last_error": state.get("error", "")}

    def node_validate(self, state: InvoiceState) -> InvoiceState:
        """Run the injected validate_fn and build retry feedback.

        Errors/warnings are read from the "_validation" key of the validated
        payload and joined into a human-readable feedback string that
        node_extract folds into the next retry prompt.
        """
        parsed = state.get("parsed_json", {})
        try:
            is_valid, validated, v_updates = self._validate_fn(parsed, state)
            self._logger.info(f"Validation completed: is_valid={is_valid}")
        except Exception as exc:
            self._logger.error(f"Validation failed: {str(exc)}", exc_info=True)
            return {**self._error_handler._set_error("validate/validate_invoice", exc), "validated_data": {}, "extract": {}}

        validation_info = (validated or {}).get("_validation", {})
        errors = validation_info.get("errors", []) or []
        warnings = validation_info.get("warnings", []) or []
        feedback_parts = []
        if errors:
            feedback_parts.append("ERRORS:\n" + "\n".join(f"- {e}" for e in errors))
        if warnings:
            feedback_parts.append("WARNINGS:\n" + "\n".join(f"- {w}" for w in warnings))

        feedback = "\n\n".join(feedback_parts) if feedback_parts else ""

        # Guard against validators that flag invalid without listing errors.
        if not is_valid and not errors:
            feedback = feedback or "Validation marked invalid but no explicit errors were returned."

        # ASCII-fold only for the log line; the state keeps the original text.
        feedback_safe = feedback.encode('ascii', errors='replace').decode('ascii')
        self._logger.info(f"Validation feedback generated ({len(errors)} errors, {len(warnings)} warnings): {feedback_safe}")
        return {
            "validated_data": validated or {},
            "extract": validated or {},
            "validation_feedback": feedback,
            "error": (v_updates.get("error") if isinstance(v_updates, dict) else None) or "",
            "last_error": state.get("error", ""),
        }

    def should_retry(self, state: InvoiceState) -> str:
        """Routing predicate for the validate edge: return "retry" or "end".

        Order matters: stop_agent wins, then the retry cap, then hard errors,
        then validation errors. A clean pass falls through to "end".
        """
        if state.get("stop_agent"):
            self._logger.info("Agent stopped by stop_agent flag")
            return "end"

        retry_count = int(state.get("retry_count", 0) or 0)

        if retry_count >= self.devsettings.max_retries:
            self._logger.warning(f"Max retries reached ({retry_count}/{self.devsettings.max_retries})")
            return "end"

        if state.get("error"):
            self._logger.info(f"Retrying extraction (attempt {retry_count + 1}/{self.devsettings.max_retries})")
            return "retry"

        # No hard error: retry only when validation reported explicit errors.
        validation_info = state.get("validated_data", {}).get("_validation", {})
        errors = validation_info.get("errors", []) or []

        if errors:
            return "retry"

        return "end"

    def node_increment_retry(self, state: InvoiceState) -> InvoiceState:
        # Bump the counter before looping back to the extract node.
        return {"retry_count": state.get("retry_count", 0) + 1}