axetract 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. axetract-0.1.0/LICENSE +21 -0
  2. axetract-0.1.0/PKG-INFO +165 -0
  3. axetract-0.1.0/README.md +116 -0
  4. axetract-0.1.0/pyproject.toml +82 -0
  5. axetract-0.1.0/src/axetract/__init__.py +13 -0
  6. axetract-0.1.0/src/axetract/data_types.py +75 -0
  7. axetract-0.1.0/src/axetract/exceptions.py +43 -0
  8. axetract-0.1.0/src/axetract/extractor/__init__.py +0 -0
  9. axetract-0.1.0/src/axetract/extractor/axe_extractor.py +143 -0
  10. axetract-0.1.0/src/axetract/extractor/base_extractor.py +28 -0
  11. axetract-0.1.0/src/axetract/llm/__init__.py +0 -0
  12. axetract-0.1.0/src/axetract/llm/base_client.py +91 -0
  13. axetract-0.1.0/src/axetract/llm/hf_client.py +319 -0
  14. axetract-0.1.0/src/axetract/llm/litellm_client.py +98 -0
  15. axetract-0.1.0/src/axetract/llm/llm_utils.py +18 -0
  16. axetract-0.1.0/src/axetract/llm/vllm_client.py +135 -0
  17. axetract-0.1.0/src/axetract/pipeline.py +533 -0
  18. axetract-0.1.0/src/axetract/postprocessor/__init__.py +0 -0
  19. axetract-0.1.0/src/axetract/postprocessor/axe_postprocessor.py +192 -0
  20. axetract-0.1.0/src/axetract/postprocessor/base_postprocessor.py +28 -0
  21. axetract-0.1.0/src/axetract/preprocessor/__init__.py +0 -0
  22. axetract-0.1.0/src/axetract/preprocessor/axe_preprocessor.py +181 -0
  23. axetract-0.1.0/src/axetract/preprocessor/base_preprocessor.py +28 -0
  24. axetract-0.1.0/src/axetract/prompts/__init__.py +0 -0
  25. axetract-0.1.0/src/axetract/prompts/pruner_prompt.py +27 -0
  26. axetract-0.1.0/src/axetract/prompts/qa_prompt.py +37 -0
  27. axetract-0.1.0/src/axetract/prompts/schema_prompt.py +30 -0
  28. axetract-0.1.0/src/axetract/pruner/__init__.py +0 -0
  29. axetract-0.1.0/src/axetract/pruner/axe_pruner.py +324 -0
  30. axetract-0.1.0/src/axetract/pruner/base_pruner.py +28 -0
  31. axetract-0.1.0/src/axetract/server.py +154 -0
  32. axetract-0.1.0/src/axetract/utils/__init__.py +0 -0
  33. axetract-0.1.0/src/axetract/utils/file_util.py +20 -0
  34. axetract-0.1.0/src/axetract/utils/html_util.py +1042 -0
  35. axetract-0.1.0/src/axetract/utils/json_util.py +101 -0
  36. axetract-0.1.0/src/axetract/utils/llm_util.py +49 -0
  37. axetract-0.1.0/src/axetract/utils/logging_util.py +60 -0
  38. axetract-0.1.0/src/axetract/utils/seed_util.py +22 -0
axetract-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Abdelrahman Mansour, Khaled Alshaer
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,165 @@
1
+ Metadata-Version: 2.3
2
+ Name: axetract
3
+ Version: 0.1.0
4
+ Summary: Low-Cost Cross-Domain Web Structured Information Extraction using specialized LoRA adapters.
5
+ Keywords: web-extraction,llm,structured-data,html-pruning,lora,vllm
6
+ Author: Abdelrahman Mansour, Khaled Alshaer
7
+ Author-email: Abdelrahman Mansour <abdelrahman.f.mansour@gmail.com>, Khaled Alshaer <khaled.w.alshaer@gmail.com>
8
+ License: MIT License
9
+
10
+ Copyright (c) 2026 Abdelrahman Mansour, Khaled Alshaer
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ of this software and associated documentation files (the "Software"), to deal
14
+ in the Software without restriction, including without limitation the rights
15
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the Software is
17
+ furnished to do so, subject to the following conditions:
18
+
19
+ The above copyright notice and this permission notice shall be included in all
20
+ copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28
+ SOFTWARE.
29
+ Classifier: Development Status :: 4 - Beta
30
+ Classifier: Intended Audience :: Developers
31
+ Classifier: License :: OSI Approved :: MIT License
32
+ Classifier: Programming Language :: Python :: 3.12
33
+ Requires-Dist: fastapi>=0.135.1
34
+ Requires-Dist: html-chunking>=0.0.4
35
+ Requires-Dist: htmlrag>=0.1.1
36
+ Requires-Dist: json-repair>=0.58.1
37
+ Requires-Dist: peft>=0.18.1
38
+ Requires-Dist: pydantic>=2.13.0b2
39
+ Requires-Dist: torch>=2.9.1
40
+ Requires-Dist: transformers>=4.57.6
41
+ Requires-Dist: uvicorn>=0.41.0
42
+ Requires-Dist: vllm>=0.16.0 ; extra == 'vllm'
43
+ Requires-Python: >=3.12
44
+ Project-URL: Repository, https://github.com/abdo-Mansour/axetract
45
+ Project-URL: Documentation, https://abdo-mansour.github.io/axetract/
46
+ Project-URL: Issue Tracker, https://github.com/abdo-Mansour/axetract/issues
47
+ Provides-Extra: vllm
48
+ Description-Content-Type: text/markdown
49
+
50
+ <div align="center">
51
+ <img src="docs/assets/logo-white-mode.svg#gh-light-mode-only" alt="AXEtract Logo" width="400">
52
+ <img src="docs/assets/logo-black-mode.svg#gh-dark-mode-only" alt="AXEtract Logo" width="400">
53
+ <h1>AXEtract</h1>
54
+ <h3>Low-Cost Cross-Domain Web Structured Information Extraction</h3>
55
+
56
+ [![Documentation](https://img.shields.io/badge/docs-latest-teal)](https://abdo-mansour.github.io/axetract/)
57
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
58
+ [![GitHub](https://img.shields.io/github/stars/abdo-Mansour/axetract?style=social)](https://github.com/abdo-Mansour/axetract)
59
+
60
+ </div>
61
+
62
+ ***
63
+
64
+ **AXEtract** is a high-performance, low-cost framework for extracting structured data from web pages. Based on the paper **"AXE: Low-Cost Cross-Domain Web Structured Information Extraction"**, it optimizes the extraction pipeline by using specialized LoRA adapters for pruning and query-specific extraction, enabling state-of-the-art results with small models (e.g., Qwen3-0.6B).
65
+
66
+ ## 🚀 Key Features
67
+
68
+ - **🎯 Specialized LoRA Adapters**: Uses task-specific adapters for DOM pruning and structured extraction, achieving high accuracy with minimal token overhead.
69
+ - **✂️ Smart DOM Pruning**: Classifies and prunes irrelevant HTML nodes before passing them to the extractor, significantly reducing context window usage and costs.
70
+ - **📍 Grounded XPath Resolution (GXR)**: Automatically maps extracted JSON fields back to their original source XPaths in the DOM for verification and grounding.
71
+ - **⚡ High-Throughput Pipeline**: Built-in support for multiple LLM engines, including **vLLM** for production-grade serving and **HuggingFace** for local research.
72
+ - **🌐 Cross-Domain Versatility**: Designed to generalize across various web domains (e-commerce, real estate, listings) without needing domain-specific rules.
73
+
74
+ ## 🛠️ Architecture
75
+
76
+ AXEtract follows a three-part decoupled pipeline for maximum efficiency:
77
+
78
+ 1. **Preprocessor**: Fetches raw HTML and chunks it into manageable, token-aware fragments.
79
+ 2. **AI Extractor**: Divided into two stages:
80
+ - **Pruner**: A lightweight LLM (LoRA-powered) filters out noise and selects only relevant HTML chunks.
81
+ - **Extractor**: A task-specific LLM maps the pruned HTML content directly to a structured JSON schema or natural language answer.
82
+ 3. **Postprocessor**: Validates the output and resolves source XPaths via Grounded XPath Resolution (GXR).
83
+
84
+ ## 📦 Installation
85
+
86
+ ```bash
87
+ # Install from PyPI
88
+ uv pip install axetract
89
+
90
+ # Or install from source
91
+ git clone https://github.com/abdo-Mansour/axetract.git
92
+ cd axetract
93
+ uv sync
94
+ ```
95
+
96
+ ## 🚥 Quick Start
97
+
98
+ ```python
99
+ from pydantic import BaseModel
100
+ from axetract.pipeline import AXEPipeline
101
+
102
+ # 1. Initialize the pipeline with default LoRA adapters
103
+ # (Automatically downloads adapters from HuggingFace Hub)
104
+ pipeline = AXEPipeline.from_config(use_vllm=False)
105
+
106
+ # 2. Define your desired extraction schema
107
+ class Product(BaseModel):
108
+ name: str
109
+ price: str
110
+ rating: float
111
+
112
+ # 3. Extract from a URL or raw HTML
113
+ url = "https://example.com/item/12345"
114
+ result = pipeline.extract(url, schema=Product)
115
+
116
+ # 4. Access your structured data
117
+ print(f"Status: {result.status}")
118
+ print(f"Prediction: {result.prediction}")
119
+ print(f"Source XPaths: {result.xpaths}")
120
+ ```
121
+
122
+ ## 🌐 API Server
123
+
124
+ AXEtract includes a built-in FastAPI server for high-throughput serving. After installing the package, start it with the installed CLI entry point:
125
+
126
+ ```bash
127
+ axe-server
128
+ ```
129
+
130
+ Or via `python -m` for development installs:
131
+
132
+ ```bash
133
+ python -m axetract.server
134
+ ```
135
+
136
+ Configuration is done via environment variables:
137
+
138
+ | Variable | Default | Description |
139
+ |---|---|---|
140
+ | `AXE_USE_VLLM` | `false` | Set to `true` to use vLLM backend |
141
+ | `AXE_PORT` | `8000` | Port to listen on |
142
+ | `AXE_HOST` | `0.0.0.0` | Host to bind to |
143
+ | `AXE_LOG_FILE` | _(stderr)_ | Optional path to a log file |
144
+
145
+ See `axe_server/client_example.py` for examples of interacting with the API via `requests`.
146
+
147
+ ## 📝 Citation
148
+
149
+ If you use AXEtract in your research, please cite our paper:
150
+
151
+ ```bibtex
152
+ @misc{mansour2026axe,
153
+ title={AXE: Low-Cost Cross-Domain Web Structured Information Extraction},
154
+ author={Abdelrahman Mansour and Khaled W. Alshaer and Moataz Elsaban},
155
+ year={2026},
156
+ eprint={2602.01838},
157
+ archivePrefix={arXiv},
158
+ primaryClass={cs.CL},
159
+ url={https://arxiv.org/abs/2602.01838},
160
+ }
161
+ ```
162
+
163
+ ## 📜 License
164
+
165
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,116 @@
1
+ <div align="center">
2
+ <img src="docs/assets/logo-white-mode.svg#gh-light-mode-only" alt="AXEtract Logo" width="400">
3
+ <img src="docs/assets/logo-black-mode.svg#gh-dark-mode-only" alt="AXEtract Logo" width="400">
4
+ <h1>AXEtract</h1>
5
+ <h3>Low-Cost Cross-Domain Web Structured Information Extraction</h3>
6
+
7
+ [![Documentation](https://img.shields.io/badge/docs-latest-teal)](https://abdo-mansour.github.io/axetract/)
8
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
9
+ [![GitHub](https://img.shields.io/github/stars/abdo-Mansour/axetract?style=social)](https://github.com/abdo-Mansour/axetract)
10
+
11
+ </div>
12
+
13
+ ***
14
+
15
+ **AXEtract** is a high-performance, low-cost framework for extracting structured data from web pages. Based on the paper **"AXE: Low-Cost Cross-Domain Web Structured Information Extraction"**, it optimizes the extraction pipeline by using specialized LoRA adapters for pruning and query-specific extraction, enabling state-of-the-art results with small models (e.g., Qwen3-0.6B).
16
+
17
+ ## 🚀 Key Features
18
+
19
+ - **🎯 Specialized LoRA Adapters**: Uses task-specific adapters for DOM pruning and structured extraction, achieving high accuracy with minimal token overhead.
20
+ - **✂️ Smart DOM Pruning**: Classifies and prunes irrelevant HTML nodes before passing them to the extractor, significantly reducing context window usage and costs.
21
+ - **📍 Grounded XPath Resolution (GXR)**: Automatically maps extracted JSON fields back to their original source XPaths in the DOM for verification and grounding.
22
+ - **⚡ High-Throughput Pipeline**: Built-in support for multiple LLM engines, including **vLLM** for production-grade serving and **HuggingFace** for local research.
23
+ - **🌐 Cross-Domain Versatility**: Designed to generalize across various web domains (e-commerce, real estate, listings) without needing domain-specific rules.
24
+
25
+ ## 🛠️ Architecture
26
+
27
+ AXEtract follows a three-part decoupled pipeline for maximum efficiency:
28
+
29
+ 1. **Preprocessor**: Fetches raw HTML and chunks it into manageable, token-aware fragments.
30
+ 2. **AI Extractor**: Divided into two stages:
31
+ - **Pruner**: A lightweight LLM (LoRA-powered) filters out noise and selects only relevant HTML chunks.
32
+ - **Extractor**: A task-specific LLM maps the pruned HTML content directly to a structured JSON schema or natural language answer.
33
+ 3. **Postprocessor**: Validates the output and resolves source XPaths via Grounded XPath Resolution (GXR).
34
+
35
+ ## 📦 Installation
36
+
37
+ ```bash
38
+ # Install from PyPI
39
+ uv pip install axetract
40
+
41
+ # Or install from source
42
+ git clone https://github.com/abdo-Mansour/axetract.git
43
+ cd axetract
44
+ uv sync
45
+ ```
46
+
47
+ ## 🚥 Quick Start
48
+
49
+ ```python
50
+ from pydantic import BaseModel
51
+ from axetract.pipeline import AXEPipeline
52
+
53
+ # 1. Initialize the pipeline with default LoRA adapters
54
+ # (Automatically downloads adapters from HuggingFace Hub)
55
+ pipeline = AXEPipeline.from_config(use_vllm=False)
56
+
57
+ # 2. Define your desired extraction schema
58
+ class Product(BaseModel):
59
+ name: str
60
+ price: str
61
+ rating: float
62
+
63
+ # 3. Extract from a URL or raw HTML
64
+ url = "https://example.com/item/12345"
65
+ result = pipeline.extract(url, schema=Product)
66
+
67
+ # 4. Access your structured data
68
+ print(f"Status: {result.status}")
69
+ print(f"Prediction: {result.prediction}")
70
+ print(f"Source XPaths: {result.xpaths}")
71
+ ```
72
+
73
+ ## 🌐 API Server
74
+
75
+ AXEtract includes a built-in FastAPI server for high-throughput serving. After installing the package, start it with the installed CLI entry point:
76
+
77
+ ```bash
78
+ axe-server
79
+ ```
80
+
81
+ Or via `python -m` for development installs:
82
+
83
+ ```bash
84
+ python -m axetract.server
85
+ ```
86
+
87
+ Configuration is done via environment variables:
88
+
89
+ | Variable | Default | Description |
90
+ |---|---|---|
91
+ | `AXE_USE_VLLM` | `false` | Set to `true` to use vLLM backend |
92
+ | `AXE_PORT` | `8000` | Port to listen on |
93
+ | `AXE_HOST` | `0.0.0.0` | Host to bind to |
94
+ | `AXE_LOG_FILE` | _(stderr)_ | Optional path to a log file |
95
+
96
+ See `axe_server/client_example.py` for examples of interacting with the API via `requests`.
97
+
98
+ ## 📝 Citation
99
+
100
+ If you use AXEtract in your research, please cite our paper:
101
+
102
+ ```bibtex
103
+ @misc{mansour2026axe,
104
+ title={AXE: Low-Cost Cross-Domain Web Structured Information Extraction},
105
+ author={Abdelrahman Mansour and Khaled W. Alshaer and Moataz Elsaban},
106
+ year={2026},
107
+ eprint={2602.01838},
108
+ archivePrefix={arXiv},
109
+ primaryClass={cs.CL},
110
+ url={https://arxiv.org/abs/2602.01838},
111
+ }
112
+ ```
113
+
114
+ ## 📜 License
115
+
116
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,82 @@
1
+ [project]
2
+ name = "axetract"
3
+ version = "0.1.0"
4
+ description = "Low-Cost Cross-Domain Web Structured Information Extraction using specialized LoRA adapters."
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ license = { file = "LICENSE" }
8
+ authors = [
9
+ {name="Abdelrahman Mansour", email = "abdelrahman.f.mansour@gmail.com"},
10
+ {name="Khaled Alshaer", email = "khaled.w.alshaer@gmail.com"},
11
+ ]
12
+ keywords = ["web-extraction", "llm", "structured-data", "html-pruning", "lora", "vllm"]
13
+ classifiers = [
14
+ "Development Status :: 4 - Beta",
15
+ "Intended Audience :: Developers",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3.12",
18
+ ]
19
+
20
+ dependencies = [
21
+ "fastapi>=0.135.1",
22
+ "html-chunking>=0.0.4",
23
+ "htmlrag>=0.1.1",
24
+ "json-repair>=0.58.1",
25
+ "peft>=0.18.1",
26
+ "pydantic>=2.13.0b2",
27
+ "torch>=2.9.1",
28
+ "transformers>=4.57.6",
29
+ "uvicorn>=0.41.0",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ vllm = [
34
+ "vllm>=0.16.0",
35
+ ]
36
+
37
+ [project.urls]
38
+ Repository = "https://github.com/abdo-Mansour/axetract"
39
+ Documentation = "https://abdo-mansour.github.io/axetract/"
40
+ "Issue Tracker" = "https://github.com/abdo-Mansour/axetract/issues"
41
+
42
+ [project.scripts]
43
+ axe-server = "axetract.server:main"
44
+
45
+ [dependency-groups]
46
+ dev = [
47
+ "mkdocs>=1.6.1,<2.0.0",
48
+ "pytest>=8.0.0",
49
+ "mkdocs-material>=9.7.3",
50
+ "mkdocs-autoapi[python]",
51
+ "mkdocstrings[python]>=1.0.3",
52
+ "ruff>=0.15.4",
53
+ "jupyter>=1.1.1",
54
+ "notebook>=7.5.4",
55
+ "ipywidgets>=8.1.8",
56
+ ]
57
+
58
+ [tool.ruff]
59
+ line-length = 100
60
+
61
+ [tool.ruff.lint]
62
+ select = ["E", "F", "I", "N", "W", "D"]
63
+ ignore = ["D100", "D104", "E501"] # Ignore missing docstrings in modules and packages, and long lines
64
+
65
+ [tool.ruff.lint.per-file-ignores]
66
+ "tests/*" = ["D1"] # Don't require docstrings in tests
67
+ "*.ipynb" = ["W291"] # Ignore trailing whitespace in notebooks
68
+ "*.py" = ["E402"]
69
+
70
+ [tool.ruff.lint.pydocstyle]
71
+ convention = "google"
72
+
73
+ [tool.ruff.lint.isort]
74
+ known-first-party = ["axetract"]
75
+
76
+ [build-system]
77
+ requires = ["uv_build>=0.10.6,<0.11.0"]
78
+ build-backend = "uv_build"
79
+
80
+ [tool.uv.build-backend]
81
+ module-name = "axetract"
82
+ module-root = "src"
@@ -0,0 +1,13 @@
1
"""AXEtract: Low-Cost Cross-Domain Web Structured Information Extraction."""

from importlib.metadata import PackageNotFoundError, version

from axetract.data_types import AXEResult, AXESample, Status
from axetract.pipeline import AXEPipeline

# Resolve the installed distribution version; fall back gracefully for
# source checkouts whose metadata has not been built yet.
try:
    __version__ = version("axetract")
except PackageNotFoundError:
    # Editable install not yet built — no distribution metadata available.
    __version__ = "unknown"

__all__ = ["AXEPipeline", "AXESample", "AXEResult", "Status", "__version__"]
@@ -0,0 +1,75 @@
1
+ from __future__ import annotations
2
+
3
+ from enum import Enum
4
+ from typing import Any, List, Optional, Type, Union
5
+
6
+ from pydantic import BaseModel
7
+
8
+
9
class Status(Enum):
    """Execution status for a processing sample.

    Every sample begins as ``PENDING`` and finishes as either ``SUCCESS``
    or ``FAILED``.
    """

    PENDING = "pending"
    SUCCESS = "success"
    FAILED = "failed"
15
+
16
+
17
class AXEChunk(BaseModel):
    """One fragment of an HTML document produced by the preprocessor.

    Attributes:
        chunkid (str): Unique identifier for the chunk.
        content (str): The raw or cleaned HTML content of the chunk.
    """

    chunkid: str
    content: str
27
+
28
+
29
class AXESample(BaseModel):
    """Carries a single extraction request through every pipeline stage.

    Attributes:
        id (str): Unique identifier for the sample.
        content (str): Input content (URL or raw HTML).
        is_content_url (bool): Whether ``content`` is a URL.
        query (Optional[str]): Natural language extraction query.
        schema_model (Optional[Union[str, Type[BaseModel], dict]]): Desired JSON schema.
        chunks (List[AXEChunk]): Processed HTML chunks.
        original_html (str): The original, uncleaned HTML content.
        current_html (str): HTML after the latest stage (e.g. cleaning or pruning).
        prediction (Optional[Union[str, dict, Any]]): The LLM's raw output or parsed JSON.
        xpaths (Optional[dict]): Map of extracted fields to their source XPaths.
        status (Status): Current processing status.
    """

    id: str
    content: str
    is_content_url: bool
    query: Optional[str] = None
    schema_model: Optional[Union[str, Type[BaseModel], dict]] = None
    # NOTE(review): pydantic copies mutable defaults per instance, so the
    # bare `[]` default is safe here (unlike a plain class attribute).
    chunks: List[AXEChunk] = []
    original_html: str = ""
    current_html: str = ""
    prediction: Optional[Union[str, dict, Any]] = None
    xpaths: Optional[dict] = None

    status: Status = Status.PENDING
58
+
59
+
60
class AXEResult(BaseModel):
    """Final extraction result handed back to the caller.

    Attributes:
        id (str): Sample identifier.
        prediction (Union[str, dict, Any]): The extracted structured data.
        xpaths (Optional[dict]): Reference XPaths for the extracted values.
        status (Status): Success or failure indicator.
        error (Optional[str]): Error message if processing failed.
    """

    id: str
    prediction: Union[str, dict, Any]
    xpaths: Optional[dict] = None
    status: Status
    error: Optional[str] = None
@@ -0,0 +1,43 @@
1
+ """Custom exceptions for the Axetract package."""
2
+
3
+
4
class AXEError(Exception):
    """Base class for all Axetract exceptions."""


class PreprocessingError(AXEError):
    """Raised when an error occurs during the preprocessing stage."""


class PruningError(AXEError):
    """Raised when an error occurs during the pruning stage."""


class ExtractionError(AXEError):
    """Raised when an error occurs during the extraction stage."""


class PostprocessingError(AXEError):
    """Raised when an error occurs during the postprocessing stage."""


class ModelLoadError(AXEError):
    """Raised when an error occurs while loading an LLM model."""


class ConfigurationError(AXEError):
    """Raised when there is an invalid configuration for the pipeline."""
File without changes
@@ -0,0 +1,143 @@
1
+ import json
2
+ import logging
3
+ from typing import List
4
+
5
+ from pydantic import BaseModel
6
+
7
+ from axetract.data_types import AXESample, Status
8
+ from axetract.extractor.base_extractor import BaseExtractor
9
+ from axetract.llm.base_client import BaseClient
10
+ from axetract.utils.json_util import is_schema
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class AXEExtractor(BaseExtractor):
    """Extract structured data (or answers) from pruned HTML using LLM adapters.

    Samples are routed by query type: JSON-schema queries are sent to the
    ``schema`` adapter and natural-language questions to the ``qa`` adapter,
    each group in a single batched LLM call.

    Attributes:
        llm_extractor_client (BaseClient): The LLM client used for extraction.
        schema_prompt_template (str): Template for schema-based extraction prompts.
        query_prompt_template (str): Template for natural language query prompts.
        name (str): Component name.
        batch_size (int): Processing batch size.
        num_workers (int): Number of parallel workers.
    """

    def __init__(
        self,
        llm_extractor_client: BaseClient,
        schema_generation_prompt_template: str,
        query_generation_prompt_template: str,
        name: str = "axe_extractor",
        batch_size: int = 16,
        num_workers: int = 4,
    ):
        """Initialize the extractor.

        Args:
            llm_extractor_client (BaseClient): LLM client.
            schema_generation_prompt_template (str): Schema prompt template;
                must contain ``{query}`` and ``{content}`` placeholders.
            query_generation_prompt_template (str): Query prompt template;
                must contain ``{query}`` and ``{content}`` placeholders.
            name (str): Component name.
            batch_size (int): Batch size.
            num_workers (int): Parallel workers.
        """
        self.llm_extractor_client = llm_extractor_client
        self.name = name
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.schema_prompt_template = schema_generation_prompt_template
        self.query_prompt_template = query_generation_prompt_template

    def _build_prompt(self, content: str, query) -> str:
        """Render the prompt for one sample, normalizing the query to a string.

        Args:
            content (str): The (pruned) HTML to extract from.
            query: A natural-language question, JSON-schema string, dict
                schema, or Pydantic model class.

        Returns:
            str: The fully formatted prompt.
        """
        if query is not None and not isinstance(query, str):
            if isinstance(query, dict):
                query = json.dumps(query)
            elif isinstance(query, type) and issubclass(query, BaseModel):
                # Pydantic v2 exposes model_json_schema(); v1 used schema_json().
                if hasattr(query, "model_json_schema"):
                    query = json.dumps(query.model_json_schema())
                elif hasattr(query, "schema_json"):
                    query = query.schema_json()

        template = self.schema_prompt_template if is_schema(query) else self.query_prompt_template
        return template.format(query=query, content=content)

    def _generate_output(self, samples: List[AXESample]) -> List[AXESample]:
        """Run batched LLM generation, routing each sample to the right adapter.

        Args:
            samples (List[AXESample]): Samples whose ``current_html`` is ready.

        Returns:
            List[AXESample]: The same samples with ``prediction`` and ``status`` set.
        """
        queries = [sample.query or sample.schema_model for sample in samples]
        prompts = [
            self._build_prompt(sample.current_html, query)
            for sample, query in zip(samples, queries)
        ]

        # Split the batch by query type so each group hits its own adapter.
        qa_indices: List[int] = []
        qa_prompts: List[str] = []
        schema_indices: List[int] = []
        schema_prompts: List[str] = []
        for idx, (query, prompt) in enumerate(zip(queries, prompts)):
            if is_schema(query):
                schema_indices.append(idx)
                schema_prompts.append(prompt)
            else:
                qa_indices.append(idx)
                qa_prompts.append(prompt)

        # Responses are written back into their original batch positions.
        final_responses: List = [None] * len(prompts)

        def run_adapter_batch(indices, batch_prompts, adapter_name, label):
            # One call_batch per adapter; debug-log inputs and outputs per sample.
            logger.debug("Processing %d %s queries...", len(batch_prompts), label)
            for orig_idx, prompt in zip(indices, batch_prompts):
                # Bug fix: look the query up by the sample's ORIGINAL index.
                # The previous code zipped the sub-batch indices against the
                # full `queries` list positionally, so it logged the wrong
                # query whenever the sub-batch was not a prefix of the batch.
                logger.debug(" [%s] sample %d query: %s", label, orig_idx, queries[orig_idx])
                logger.debug(" [%s] sample %d prompt: %s", label, orig_idx, prompt)
            responses = self.llm_extractor_client.call_batch(
                batch_prompts, adapter_name=adapter_name
            )
            for orig_idx, response in zip(indices, responses):
                logger.debug(" [%s] sample %d response: %s", label, orig_idx, response)
                final_responses[orig_idx] = response

        if qa_prompts:
            run_adapter_batch(qa_indices, qa_prompts, "qa", "QA")
        if schema_prompts:
            run_adapter_batch(schema_indices, schema_prompts, "schema", "Schema")

        for sample, response in zip(samples, final_responses):
            sample.prediction = response
            # A missing response marks the sample as failed.
            sample.status = Status.SUCCESS if response is not None else Status.FAILED
            logger.debug(" [Extractor] sample %s final prediction: %s", sample.id, response)
        return samples

    def __call__(self, samples: List[AXESample]) -> List[AXESample]:
        """Run the extraction process on a batch of samples.

        Args:
            samples (List[AXESample]): Input samples with clean HTML.

        Returns:
            List[AXESample]: Samples with LLM-generated predictions.
        """
        return self._generate_output(samples)