axetract 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- axetract-0.1.0/LICENSE +21 -0
- axetract-0.1.0/PKG-INFO +165 -0
- axetract-0.1.0/README.md +116 -0
- axetract-0.1.0/pyproject.toml +82 -0
- axetract-0.1.0/src/axetract/__init__.py +13 -0
- axetract-0.1.0/src/axetract/data_types.py +75 -0
- axetract-0.1.0/src/axetract/exceptions.py +43 -0
- axetract-0.1.0/src/axetract/extractor/__init__.py +0 -0
- axetract-0.1.0/src/axetract/extractor/axe_extractor.py +143 -0
- axetract-0.1.0/src/axetract/extractor/base_extractor.py +28 -0
- axetract-0.1.0/src/axetract/llm/__init__.py +0 -0
- axetract-0.1.0/src/axetract/llm/base_client.py +91 -0
- axetract-0.1.0/src/axetract/llm/hf_client.py +319 -0
- axetract-0.1.0/src/axetract/llm/litellm_client.py +98 -0
- axetract-0.1.0/src/axetract/llm/llm_utils.py +18 -0
- axetract-0.1.0/src/axetract/llm/vllm_client.py +135 -0
- axetract-0.1.0/src/axetract/pipeline.py +533 -0
- axetract-0.1.0/src/axetract/postprocessor/__init__.py +0 -0
- axetract-0.1.0/src/axetract/postprocessor/axe_postprocessor.py +192 -0
- axetract-0.1.0/src/axetract/postprocessor/base_postprocessor.py +28 -0
- axetract-0.1.0/src/axetract/preprocessor/__init__.py +0 -0
- axetract-0.1.0/src/axetract/preprocessor/axe_preprocessor.py +181 -0
- axetract-0.1.0/src/axetract/preprocessor/base_preprocessor.py +28 -0
- axetract-0.1.0/src/axetract/prompts/__init__.py +0 -0
- axetract-0.1.0/src/axetract/prompts/pruner_prompt.py +27 -0
- axetract-0.1.0/src/axetract/prompts/qa_prompt.py +37 -0
- axetract-0.1.0/src/axetract/prompts/schema_prompt.py +30 -0
- axetract-0.1.0/src/axetract/pruner/__init__.py +0 -0
- axetract-0.1.0/src/axetract/pruner/axe_pruner.py +324 -0
- axetract-0.1.0/src/axetract/pruner/base_pruner.py +28 -0
- axetract-0.1.0/src/axetract/server.py +154 -0
- axetract-0.1.0/src/axetract/utils/__init__.py +0 -0
- axetract-0.1.0/src/axetract/utils/file_util.py +20 -0
- axetract-0.1.0/src/axetract/utils/html_util.py +1042 -0
- axetract-0.1.0/src/axetract/utils/json_util.py +101 -0
- axetract-0.1.0/src/axetract/utils/llm_util.py +49 -0
- axetract-0.1.0/src/axetract/utils/logging_util.py +60 -0
- axetract-0.1.0/src/axetract/utils/seed_util.py +22 -0
axetract-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Abdelrahman Mansour, Khaled Alshaer
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
axetract-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: axetract
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Low-Cost Cross-Domain Web Structured Information Extraction using specialized LoRA adapters.
|
|
5
|
+
Keywords: web-extraction,llm,structured-data,html-pruning,lora,vllm
|
|
6
|
+
Author: Abdelrahman Mansour, Khaled Alshaer
|
|
7
|
+
Author-email: Abdelrahman Mansour <abdelrahman.f.mansour@gmail.com>, Khaled Alshaer <khaled.w.alshaer@gmail.com>
|
|
8
|
+
License: MIT License
|
|
9
|
+
|
|
10
|
+
Copyright (c) 2026 Abdelrahman Mansour, Khaled Alshaer
|
|
11
|
+
|
|
12
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
13
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
14
|
+
in the Software without restriction, including without limitation the rights
|
|
15
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
16
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
17
|
+
furnished to do so, subject to the following conditions:
|
|
18
|
+
|
|
19
|
+
The above copyright notice and this permission notice shall be included in all
|
|
20
|
+
copies or substantial portions of the Software.
|
|
21
|
+
|
|
22
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
23
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
24
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
25
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
26
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
27
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
28
|
+
SOFTWARE.
|
|
29
|
+
Classifier: Development Status :: 4 - Beta
|
|
30
|
+
Classifier: Intended Audience :: Developers
|
|
31
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
32
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
33
|
+
Requires-Dist: fastapi>=0.135.1
|
|
34
|
+
Requires-Dist: html-chunking>=0.0.4
|
|
35
|
+
Requires-Dist: htmlrag>=0.1.1
|
|
36
|
+
Requires-Dist: json-repair>=0.58.1
|
|
37
|
+
Requires-Dist: peft>=0.18.1
|
|
38
|
+
Requires-Dist: pydantic>=2.13.0b2
|
|
39
|
+
Requires-Dist: torch>=2.9.1
|
|
40
|
+
Requires-Dist: transformers>=4.57.6
|
|
41
|
+
Requires-Dist: uvicorn>=0.41.0
|
|
42
|
+
Requires-Dist: vllm>=0.16.0 ; extra == 'vllm'
|
|
43
|
+
Requires-Python: >=3.12
|
|
44
|
+
Project-URL: Repository, https://github.com/abdo-Mansour/axetract
|
|
45
|
+
Project-URL: Documentation, https://abdo-mansour.github.io/axetract/
|
|
46
|
+
Project-URL: Issue Tracker, https://github.com/abdo-Mansour/axetract/issues
|
|
47
|
+
Provides-Extra: vllm
|
|
48
|
+
Description-Content-Type: text/markdown
|
|
49
|
+
|
|
50
|
+
<div align="center">
|
|
51
|
+
<img src="docs/assets/logo-white-mode.svg#gh-light-mode-only" alt="AXEtract Logo" width="400">
|
|
52
|
+
<img src="docs/assets/logo-black-mode.svg#gh-dark-mode-only" alt="AXEtract Logo" width="400">
|
|
53
|
+
<h1>AXEtract</h1>
|
|
54
|
+
<h3>Low-Cost Cross-Domain Web Structured Information Extraction</h3>
|
|
55
|
+
|
|
56
|
+
[![Documentation](https://img.shields.io/badge/docs-latest-teal)](https://abdo-mansour.github.io/axetract/)
|
|
57
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
58
|
+
[![GitHub](https://img.shields.io/github/stars/abdo-Mansour/axetract?style=social)](https://github.com/abdo-Mansour/axetract)
|
|
59
|
+
|
|
60
|
+
</div>
|
|
61
|
+
|
|
62
|
+
***
|
|
63
|
+
|
|
64
|
+
**AXEtract** is a high-performance, low-cost framework for extracting structured data from web pages. Based on the paper **"AXE: Low-Cost Cross-Domain Web Structured Information Extraction"**, it optimizes the extraction pipeline by using specialized LoRA adapters for pruning and query-specific extraction, enabling state-of-the-art results with small models (e.g., Qwen3-0.6B).
|
|
65
|
+
|
|
66
|
+
## 🚀 Key Features
|
|
67
|
+
|
|
68
|
+
- **🎯 Specialized LoRA Adapters**: Uses task-specific adapters for DOM pruning and structured extraction, achieving high accuracy with minimal token overhead.
|
|
69
|
+
- **✂️ Smart DOM Pruning**: Classifies and prunes irrelevant HTML nodes before passing them to the extractor, significantly reducing context window usage and costs.
|
|
70
|
+
- **📍 Grounded XPath Resolution (GXR)**: Automatically maps extracted JSON fields back to their original source XPaths in the DOM for verification and grounding.
|
|
71
|
+
- **⚡ High-Throughput Pipeline**: Built-in support for multiple LLM engines, including **vLLM** for production-grade serving and **HuggingFace** for local research.
|
|
72
|
+
- **🌐 Cross-Domain Versatility**: Designed to generalize across various web domains (e-commerce, real estate, listings) without needing domain-specific rules.
|
|
73
|
+
|
|
74
|
+
## 🛠️ Architecture
|
|
75
|
+
|
|
76
|
+
AXEtract follows a three-part decoupled pipeline for maximum efficiency:
|
|
77
|
+
|
|
78
|
+
1. **Preprocessor**: Fetches raw HTML and chunks it into manageable, token-aware fragments.
|
|
79
|
+
2. **AI Extractor**: Divided into two stages:
|
|
80
|
+
- **Pruner**: A lightweight LLM (LoRA-powered) filters out noise and selects only relevant HTML chunks.
|
|
81
|
+
- **Extractor**: A task-specific LLM maps the pruned HTML content directly to a structured JSON schema or natural language answer.
|
|
82
|
+
3. **Postprocessor**: Validates the output and resolves source XPaths via Grounded XPath Resolution (GXR).
|
|
83
|
+
|
|
84
|
+
## 📦 Installation
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
# Install from PyPI
|
|
88
|
+
uv pip install axetract
|
|
89
|
+
|
|
90
|
+
# Or install from source
|
|
91
|
+
git clone https://github.com/abdo-Mansour/axetract.git
|
|
92
|
+
cd axetract
|
|
93
|
+
uv sync
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## 🚥 Quick Start
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from pydantic import BaseModel
|
|
100
|
+
from axetract.pipeline import AXEPipeline
|
|
101
|
+
|
|
102
|
+
# 1. Initialize the pipeline with default LoRA adapters
|
|
103
|
+
# (Automatically downloads adapters from HuggingFace Hub)
|
|
104
|
+
pipeline = AXEPipeline.from_config(use_vllm=False)
|
|
105
|
+
|
|
106
|
+
# 2. Define your desired extraction schema
|
|
107
|
+
class Product(BaseModel):
|
|
108
|
+
name: str
|
|
109
|
+
price: str
|
|
110
|
+
rating: float
|
|
111
|
+
|
|
112
|
+
# 3. Extract from a URL or raw HTML
|
|
113
|
+
url = "https://example.com/item/12345"
|
|
114
|
+
result = pipeline.extract(url, schema=Product)
|
|
115
|
+
|
|
116
|
+
# 4. Access your structured data
|
|
117
|
+
print(f"Status: {result.status}")
|
|
118
|
+
print(f"Prediction: {result.prediction}")
|
|
119
|
+
print(f"Source XPaths: {result.xpaths}")
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## 🌐 API Server
|
|
123
|
+
|
|
124
|
+
AXEtract includes a built-in FastAPI server for high-throughput serving. After installing the package, start it with the installed CLI entry point:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
axe-server
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Or via `python -m` for development installs:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
python -m axetract.server
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Configuration is done via environment variables:
|
|
137
|
+
|
|
138
|
+
| Variable | Default | Description |
|
|
139
|
+
|---|---|---|
|
|
140
|
+
| `AXE_USE_VLLM` | `false` | Set to `true` to use vLLM backend |
|
|
141
|
+
| `AXE_PORT` | `8000` | Port to listen on |
|
|
142
|
+
| `AXE_HOST` | `0.0.0.0` | Host to bind to |
|
|
143
|
+
| `AXE_LOG_FILE` | _(stderr)_ | Optional path to a log file |
|
|
144
|
+
|
|
145
|
+
See `axe_server/client_example.py` for examples of interacting with the API via `requests`.
|
|
146
|
+
|
|
147
|
+
## 📝 Citation
|
|
148
|
+
|
|
149
|
+
If you use AXEtract in your research, please cite our paper:
|
|
150
|
+
|
|
151
|
+
```bibtex
|
|
152
|
+
@misc{mansour2026axe,
|
|
153
|
+
title={AXE: Low-Cost Cross-Domain Web Structured Information Extraction},
|
|
154
|
+
author={Abdelrahman Mansour and Khaled W. Alshaer and Moataz Elsaban},
|
|
155
|
+
year={2026},
|
|
156
|
+
eprint={2602.01838},
|
|
157
|
+
archivePrefix={arXiv},
|
|
158
|
+
primaryClass={cs.CL},
|
|
159
|
+
url={https://arxiv.org/abs/2602.01838},
|
|
160
|
+
}
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## 📜 License
|
|
164
|
+
|
|
165
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
axetract-0.1.0/README.md
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<img src="docs/assets/logo-white-mode.svg#gh-light-mode-only" alt="AXEtract Logo" width="400">
|
|
3
|
+
<img src="docs/assets/logo-black-mode.svg#gh-dark-mode-only" alt="AXEtract Logo" width="400">
|
|
4
|
+
<h1>AXEtract</h1>
|
|
5
|
+
<h3>Low-Cost Cross-Domain Web Structured Information Extraction</h3>
|
|
6
|
+
|
|
7
|
+
[![Documentation](https://img.shields.io/badge/docs-latest-teal)](https://abdo-mansour.github.io/axetract/)
|
|
8
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
9
|
+
[![GitHub](https://img.shields.io/github/stars/abdo-Mansour/axetract?style=social)](https://github.com/abdo-Mansour/axetract)
|
|
10
|
+
|
|
11
|
+
</div>
|
|
12
|
+
|
|
13
|
+
***
|
|
14
|
+
|
|
15
|
+
**AXEtract** is a high-performance, low-cost framework for extracting structured data from web pages. Based on the paper **"AXE: Low-Cost Cross-Domain Web Structured Information Extraction"**, it optimizes the extraction pipeline by using specialized LoRA adapters for pruning and query-specific extraction, enabling state-of-the-art results with small models (e.g., Qwen3-0.6B).
|
|
16
|
+
|
|
17
|
+
## 🚀 Key Features
|
|
18
|
+
|
|
19
|
+
- **🎯 Specialized LoRA Adapters**: Uses task-specific adapters for DOM pruning and structured extraction, achieving high accuracy with minimal token overhead.
|
|
20
|
+
- **✂️ Smart DOM Pruning**: Classifies and prunes irrelevant HTML nodes before passing them to the extractor, significantly reducing context window usage and costs.
|
|
21
|
+
- **📍 Grounded XPath Resolution (GXR)**: Automatically maps extracted JSON fields back to their original source XPaths in the DOM for verification and grounding.
|
|
22
|
+
- **⚡ High-Throughput Pipeline**: Built-in support for multiple LLM engines, including **vLLM** for production-grade serving and **HuggingFace** for local research.
|
|
23
|
+
- **🌐 Cross-Domain Versatility**: Designed to generalize across various web domains (e-commerce, real estate, listings) without needing domain-specific rules.
|
|
24
|
+
|
|
25
|
+
## 🛠️ Architecture
|
|
26
|
+
|
|
27
|
+
AXEtract follows a three-part decoupled pipeline for maximum efficiency:
|
|
28
|
+
|
|
29
|
+
1. **Preprocessor**: Fetches raw HTML and chunks it into manageable, token-aware fragments.
|
|
30
|
+
2. **AI Extractor**: Divided into two stages:
|
|
31
|
+
- **Pruner**: A lightweight LLM (LoRA-powered) filters out noise and selects only relevant HTML chunks.
|
|
32
|
+
- **Extractor**: A task-specific LLM maps the pruned HTML content directly to a structured JSON schema or natural language answer.
|
|
33
|
+
3. **Postprocessor**: Validates the output and resolves source XPaths via Grounded XPath Resolution (GXR).
|
|
34
|
+
|
|
35
|
+
## 📦 Installation
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
# Install from PyPI
|
|
39
|
+
uv pip install axetract
|
|
40
|
+
|
|
41
|
+
# Or install from source
|
|
42
|
+
git clone https://github.com/abdo-Mansour/axetract.git
|
|
43
|
+
cd axetract
|
|
44
|
+
uv sync
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## 🚥 Quick Start
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from pydantic import BaseModel
|
|
51
|
+
from axetract.pipeline import AXEPipeline
|
|
52
|
+
|
|
53
|
+
# 1. Initialize the pipeline with default LoRA adapters
|
|
54
|
+
# (Automatically downloads adapters from HuggingFace Hub)
|
|
55
|
+
pipeline = AXEPipeline.from_config(use_vllm=False)
|
|
56
|
+
|
|
57
|
+
# 2. Define your desired extraction schema
|
|
58
|
+
class Product(BaseModel):
|
|
59
|
+
name: str
|
|
60
|
+
price: str
|
|
61
|
+
rating: float
|
|
62
|
+
|
|
63
|
+
# 3. Extract from a URL or raw HTML
|
|
64
|
+
url = "https://example.com/item/12345"
|
|
65
|
+
result = pipeline.extract(url, schema=Product)
|
|
66
|
+
|
|
67
|
+
# 4. Access your structured data
|
|
68
|
+
print(f"Status: {result.status}")
|
|
69
|
+
print(f"Prediction: {result.prediction}")
|
|
70
|
+
print(f"Source XPaths: {result.xpaths}")
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## 🌐 API Server
|
|
74
|
+
|
|
75
|
+
AXEtract includes a built-in FastAPI server for high-throughput serving. After installing the package, start it with the installed CLI entry point:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
axe-server
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Or via `python -m` for development installs:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
python -m axetract.server
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Configuration is done via environment variables:
|
|
88
|
+
|
|
89
|
+
| Variable | Default | Description |
|
|
90
|
+
|---|---|---|
|
|
91
|
+
| `AXE_USE_VLLM` | `false` | Set to `true` to use vLLM backend |
|
|
92
|
+
| `AXE_PORT` | `8000` | Port to listen on |
|
|
93
|
+
| `AXE_HOST` | `0.0.0.0` | Host to bind to |
|
|
94
|
+
| `AXE_LOG_FILE` | _(stderr)_ | Optional path to a log file |
|
|
95
|
+
|
|
96
|
+
See `axe_server/client_example.py` for examples of interacting with the API via `requests`.
|
|
97
|
+
|
|
98
|
+
## 📝 Citation
|
|
99
|
+
|
|
100
|
+
If you use AXEtract in your research, please cite our paper:
|
|
101
|
+
|
|
102
|
+
```bibtex
|
|
103
|
+
@misc{mansour2026axe,
|
|
104
|
+
title={AXE: Low-Cost Cross-Domain Web Structured Information Extraction},
|
|
105
|
+
author={Abdelrahman Mansour and Khaled W. Alshaer and Moataz Elsaban},
|
|
106
|
+
year={2026},
|
|
107
|
+
eprint={2602.01838},
|
|
108
|
+
archivePrefix={arXiv},
|
|
109
|
+
primaryClass={cs.CL},
|
|
110
|
+
url={https://arxiv.org/abs/2602.01838},
|
|
111
|
+
}
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## 📜 License
|
|
115
|
+
|
|
116
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "axetract"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Low-Cost Cross-Domain Web Structured Information Extraction using specialized LoRA adapters."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
license = { file = "LICENSE" }
|
|
8
|
+
authors = [
|
|
9
|
+
{name="Abdelrahman Mansour", email = "abdelrahman.f.mansour@gmail.com"},
|
|
10
|
+
{name="Khaled Alshaer", email = "khaled.w.alshaer@gmail.com"},
|
|
11
|
+
]
|
|
12
|
+
keywords = ["web-extraction", "llm", "structured-data", "html-pruning", "lora", "vllm"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 4 - Beta",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
dependencies = [
|
|
21
|
+
"fastapi>=0.135.1",
|
|
22
|
+
"html-chunking>=0.0.4",
|
|
23
|
+
"htmlrag>=0.1.1",
|
|
24
|
+
"json-repair>=0.58.1",
|
|
25
|
+
"peft>=0.18.1",
|
|
26
|
+
"pydantic>=2.13.0b2",
|
|
27
|
+
"torch>=2.9.1",
|
|
28
|
+
"transformers>=4.57.6",
|
|
29
|
+
"uvicorn>=0.41.0",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
vllm = [
|
|
34
|
+
"vllm>=0.16.0",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Repository = "https://github.com/abdo-Mansour/axetract"
|
|
39
|
+
Documentation = "https://abdo-mansour.github.io/axetract/"
|
|
40
|
+
"Issue Tracker" = "https://github.com/abdo-Mansour/axetract/issues"
|
|
41
|
+
|
|
42
|
+
[project.scripts]
|
|
43
|
+
axe-server = "axetract.server:main"
|
|
44
|
+
|
|
45
|
+
[dependency-groups]
|
|
46
|
+
dev = [
|
|
47
|
+
"mkdocs>=1.6.1,<2.0.0",
|
|
48
|
+
"pytest>=8.0.0",
|
|
49
|
+
"mkdocs-material>=9.7.3",
|
|
50
|
+
"mkdocs-autoapi[python]",
|
|
51
|
+
"mkdocstrings[python]>=1.0.3",
|
|
52
|
+
"ruff>=0.15.4",
|
|
53
|
+
"jupyter>=1.1.1",
|
|
54
|
+
"notebook>=7.5.4",
|
|
55
|
+
"ipywidgets>=8.1.8",
|
|
56
|
+
]
|
|
57
|
+
|
|
58
|
+
[tool.ruff]
|
|
59
|
+
line-length = 100
|
|
60
|
+
|
|
61
|
+
[tool.ruff.lint]
|
|
62
|
+
select = ["E", "F", "I", "N", "W", "D"]
|
|
63
|
+
ignore = ["D100", "D104", "E501"] # Ignore missing docstrings in modules and packages, and long lines
|
|
64
|
+
|
|
65
|
+
[tool.ruff.lint.per-file-ignores]
|
|
66
|
+
"tests/*" = ["D1"] # Don't require docstrings in tests
|
|
67
|
+
"*.ipynb" = ["W291"] # Ignore trailing whitespace in notebooks
|
|
68
|
+
"*.py" = ["E402"]
|
|
69
|
+
|
|
70
|
+
[tool.ruff.lint.pydocstyle]
|
|
71
|
+
convention = "google"
|
|
72
|
+
|
|
73
|
+
[tool.ruff.lint.isort]
|
|
74
|
+
known-first-party = ["axetract"]
|
|
75
|
+
|
|
76
|
+
[build-system]
|
|
77
|
+
requires = ["uv_build>=0.10.6,<0.11.0"]
|
|
78
|
+
build-backend = "uv_build"
|
|
79
|
+
|
|
80
|
+
[tool.uv.build-backend]
|
|
81
|
+
module-name = "axetract"
|
|
82
|
+
module-root = "src"
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""AXEtract: Low-Cost Cross-Domain Web Structured Information Extraction."""

from importlib.metadata import PackageNotFoundError, version

from axetract.data_types import AXEResult, AXESample, Status
from axetract.pipeline import AXEPipeline

# Resolve the installed distribution version at import time so users can
# introspect axetract.__version__.
try:
    __version__ = version("axetract")
except PackageNotFoundError:  # editable install not yet built — metadata missing
    __version__ = "unknown"

# Public API of the package.
__all__ = ["AXEPipeline", "AXESample", "AXEResult", "Status", "__version__"]
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from typing import Any, List, Optional, Type, Union
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Status(Enum):
    """Execution status for a processing sample."""

    PENDING = "pending"  # Sample created but not yet processed.
    SUCCESS = "success"  # Processing completed and a prediction was produced.
    FAILED = "failed"  # Processing finished without a usable prediction.
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class AXEChunk(BaseModel):
    """A single chunk of HTML content.

    Attributes:
        chunkid (str): Unique identifier for the chunk.
        content (str): The raw or cleaned HTML content.
    """

    chunkid: str  # Unique identifier for the chunk within its sample.
    content: str  # Raw or cleaned HTML fragment carried by this chunk.
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class AXESample(BaseModel):
    """A data container for a single extraction request throughout the pipeline.

    Attributes:
        id (str): Unique identifier for the sample.
        content (str): Input content (URL or raw HTML).
        is_content_url (bool): Whether the content is a URL.
        query (Optional[str]): Natural language extraction query.
        schema_model (Optional[Union[str, Type[BaseModel], dict]]): Desired JSON schema.
        chunks (List[AXEChunk]): List of processed HTML chunks.
        original_html (str): The original, uncleaned HTML content.
        current_html (str): The current state of HTML (e.g., after cleaning or pruning).
        prediction (Optional[Union[str, dict, Any]]): The LLM's raw output or parsed JSON.
        xpaths (Optional[dict]): Map of extracted fields to their source XPaths.
        status (Status): Current processing status.
    """

    id: str
    content: str
    # True when `content` holds a URL to fetch; False when it is raw HTML.
    is_content_url: bool
    query: Optional[str] = None
    schema_model: Optional[Union[str, Type[BaseModel], dict]] = None
    # NOTE(review): mutable default — safe only because Pydantic copies field
    # defaults per instance; a plain class would share one list across objects.
    chunks: List[AXEChunk] = []
    original_html: str = ""
    current_html: str = ""
    prediction: Optional[Union[str, dict, Any]] = None
    xpaths: Optional[dict] = None

    # Lifecycle flag updated as the sample moves through the pipeline stages.
    status: Status = Status.PENDING
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class AXEResult(BaseModel):
    """Final extraction result returned to the user.

    Attributes:
        id (str): Sample identifier.
        prediction (Union[str, dict, Any]): The extracted structured data.
        xpaths (Optional[dict]): Reference XPaths for the extracted values.
        status (Status): Success or failure indicator.
        error (Optional[str]): Error message if processing failed.
    """

    id: str
    prediction: Union[str, dict, Any]
    # Map of extracted fields to the DOM XPaths they were grounded in (GXR).
    xpaths: Optional[dict] = None
    status: Status
    # Populated only when status indicates failure.
    error: Optional[str] = None
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Custom exceptions for the Axetract package."""


class AXEError(Exception):
    """Base class for all Axetract exceptions."""


class PreprocessingError(AXEError):
    """Raised when an error occurs during the preprocessing stage."""


class PruningError(AXEError):
    """Raised when an error occurs during the pruning stage."""


class ExtractionError(AXEError):
    """Raised when an error occurs during the extraction stage."""


class PostprocessingError(AXEError):
    """Raised when an error occurs during the postprocessing stage."""


class ModelLoadError(AXEError):
    """Raised when an error occurs while loading an LLM model."""


class ConfigurationError(AXEError):
    """Raised when there is an invalid configuration for the pipeline."""
|
File without changes
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
from axetract.data_types import AXESample, Status
|
|
8
|
+
from axetract.extractor.base_extractor import BaseExtractor
|
|
9
|
+
from axetract.llm.base_client import BaseClient
|
|
10
|
+
from axetract.utils.json_util import is_schema
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class AXEExtractor(BaseExtractor):
    """Component for extracting structured data from HTML using LLMs.

    Attributes:
        llm_extractor_client (BaseClient): The LLM client used for extraction.
        schema_prompt_template (str): Template for schema-based extraction prompts.
        query_prompt_template (str): Template for natural language query prompts.
        name (str): Component name.
        batch_size (int): Processing batch size.
        num_workers (int): Number of parallel workers.
    """

    def __init__(
        self,
        llm_extractor_client: BaseClient,
        schema_generation_prompt_template: str,
        query_generation_prompt_template: str,
        name: str = "axe_extractor",
        batch_size: int = 16,
        num_workers: int = 4,
    ):
        """Initialize the extractor.

        Args:
            llm_extractor_client (BaseClient): LLM client.
            schema_generation_prompt_template (str): Schema prompt template.
            query_generation_prompt_template (str): Query prompt template.
            name (str): Component name.
            batch_size (int): Batch size.
            num_workers (int): Parallel workers.
        """
        self.llm_extractor_client = llm_extractor_client
        self.name = name
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.schema_prompt_template = schema_generation_prompt_template
        self.query_prompt_template = query_generation_prompt_template

    def _generate_output(self, samples: List[AXESample]) -> List[AXESample]:
        """Generate LLM predictions for a batch of samples.

        Samples are routed to the "qa" or "schema" LoRA adapter depending on
        whether their query is a JSON schema, then results are merged back
        into the original batch order.

        Args:
            samples (List[AXESample]): Samples with prepared ``current_html``.

        Returns:
            List[AXESample]: The same samples with ``prediction`` and
            ``status`` populated.
        """

        def build_prompt(data: AXESample) -> str:
            query = data.query or data.schema_model
            content = data.current_html

            # Convert Query/Schema to appropriate string if it is a dictionary
            # or Pydantic model class.
            if query is not None and not isinstance(query, str):
                if isinstance(query, dict):
                    query = json.dumps(query)
                elif isinstance(query, type) and issubclass(query, BaseModel):
                    # For Pydantic V2 use model_json_schema, for V1 use schema_json
                    if hasattr(query, "model_json_schema"):
                        query = json.dumps(query.model_json_schema())
                    elif hasattr(query, "schema_json"):
                        query = query.schema_json()

            if is_schema(query):
                return self.schema_prompt_template.format(query=query, content=content)
            else:
                return self.query_prompt_template.format(query=query, content=content)

        prompts = [build_prompt(sample) for sample in samples]
        queries = [sample.query or sample.schema_model for sample in samples]

        # Storage for split batches
        qa_indices: List[int] = []
        qa_prompts: List[str] = []

        schema_indices: List[int] = []
        schema_prompts: List[str] = []

        # 1. Split based on Query Type
        for idx, (q, p) in enumerate(zip(queries, prompts)):
            if is_schema(q):
                schema_indices.append(idx)
                schema_prompts.append(p)
            else:
                qa_indices.append(idx)
                qa_prompts.append(p)

        # Holder for final results in original order
        final_responses = [None] * len(prompts)

        # 2. Run QA Batch (Adapter: "qa")
        if qa_prompts:
            logger.debug("Processing %d QA queries...", len(qa_prompts))
            # BUGFIX: the original zipped qa_indices against the *full* queries
            # list, pairing qa_indices[i] with queries[i] rather than
            # queries[qa_indices[i]], so debug logs attributed the wrong
            # query/prompt to each sample. Index queries by the original
            # sample position instead.
            for pos, orig_idx in enumerate(qa_indices):
                logger.debug(" [QA] sample %d query: %s", orig_idx, queries[orig_idx])
                logger.debug(" [QA] sample %d prompt: %s", orig_idx, qa_prompts[pos])
            qa_responses = self.llm_extractor_client.call_batch(qa_prompts, adapter_name="qa")

            for original_idx, response in zip(qa_indices, qa_responses):
                logger.debug(" [QA] sample %d response: %s", original_idx, response)
                final_responses[original_idx] = response

        # 3. Run Schema Batch (Adapter: "schema")
        if schema_prompts:
            logger.debug("Processing %d Schema queries...", len(schema_prompts))
            # Same index fix as the QA branch above.
            for pos, orig_idx in enumerate(schema_indices):
                logger.debug(" [Schema] sample %d schema: %s", orig_idx, queries[orig_idx])
                logger.debug(" [Schema] sample %d prompt: %s", orig_idx, schema_prompts[pos])
            schema_responses = self.llm_extractor_client.call_batch(
                schema_prompts, adapter_name="schema"
            )

            for original_idx, response in zip(schema_indices, schema_responses):
                logger.debug(" [Schema] sample %d response: %s", original_idx, response)
                final_responses[original_idx] = response

        # Attach predictions and mark success/failure per sample.
        for sample, response in zip(samples, final_responses):
            sample.prediction = response
            sample.status = Status.SUCCESS if response is not None else Status.FAILED
            logger.debug(" [Extractor] sample %s final prediction: %s", sample.id, response)
        return samples

    def __call__(self, samples: List[AXESample]) -> List[AXESample]:
        """Run the extraction process on a batch of samples.

        Args:
            samples (List[AXESample]): Input samples with clean HTML.

        Returns:
            List[AXESample]: Samples with LLM-generated predictions.
        """
        # Step 3: Generate (Optimized Parallel)
        return self._generate_output(samples)
|