py-web-text-extractor 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_web_text_extractor-0.1.0/PKG-INFO +192 -0
- py_web_text_extractor-0.1.0/README.md +166 -0
- py_web_text_extractor-0.1.0/pyproject.toml +132 -0
- py_web_text_extractor-0.1.0/src/py_web_text_extractor/__init__.py +27 -0
- py_web_text_extractor-0.1.0/src/py_web_text_extractor/abstract/__init__.py +9 -0
- py_web_text_extractor-0.1.0/src/py_web_text_extractor/abstract/extractor.py +85 -0
- py_web_text_extractor-0.1.0/src/py_web_text_extractor/cli.py +84 -0
- py_web_text_extractor-0.1.0/src/py_web_text_extractor/exception/__init__.py +21 -0
- py_web_text_extractor-0.1.0/src/py_web_text_extractor/exception/exceptions.py +21 -0
- py_web_text_extractor-0.1.0/src/py_web_text_extractor/main.py +27 -0
- py_web_text_extractor-0.1.0/src/py_web_text_extractor/py.typed +0 -0
- py_web_text_extractor-0.1.0/src/py_web_text_extractor/service/__init__.py +12 -0
- py_web_text_extractor-0.1.0/src/py_web_text_extractor/service/extractor_service.py +112 -0
- py_web_text_extractor-0.1.0/src/py_web_text_extractor/service/markitdown_extractor.py +45 -0
- py_web_text_extractor-0.1.0/src/py_web_text_extractor/service/trafilatura_extractor.py +54 -0
- py_web_text_extractor-0.1.0/src/py_web_text_extractor/tools/__init__.py +10 -0
- py_web_text_extractor-0.1.0/src/py_web_text_extractor/tools/validation.py +66 -0
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: py-web-text-extractor
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Library-wrapper for extracting text from Web Pages
|
|
5
|
+
Keywords: web,text,extraction,scraping,html,markdown,trafilatura,markitdown
|
|
6
|
+
Author: Oleksandr Kostenko
|
|
7
|
+
Author-email: Oleksandr Kostenko <sanyokkua@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
15
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
16
|
+
Classifier: Topic :: Text Processing
|
|
17
|
+
Classifier: Typing :: Typed
|
|
18
|
+
Requires-Dist: markitdown>=0.0.2
|
|
19
|
+
Requires-Dist: trafilatura>=2.0.0
|
|
20
|
+
Requires-Dist: typer>=0.12.0
|
|
21
|
+
Requires-Python: >=3.14
|
|
22
|
+
Project-URL: Homepage, https://github.com/sanyokkua/py_web_text_extractor
|
|
23
|
+
Project-URL: Repository, https://github.com/sanyokkua/py_web_text_extractor
|
|
24
|
+
Project-URL: Issues, https://github.com/sanyokkua/py_web_text_extractor/issues
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# py-web-text-extractor
|
|
28
|
+
|
|
29
|
+
[](https://github.com/sanyokkua/py_web_text_extractor/actions)
|
|
30
|
+
[](https://pypi.org/project/py-web-text-extractor/)
|
|
31
|
+
[](https://www.python.org/downloads/)
|
|
32
|
+
[](https://opensource.org/licenses/MIT)
|
|
33
|
+
|
|
34
|
+
A CLI tool and Python library to extract clean text content from web pages.
|
|
35
|
+
|
|
36
|
+
## Overview
|
|
37
|
+
|
|
38
|
+
`py-web-text-extractor` provides a simple interface for extracting the main text content from HTML documents. It can be used as a command-line tool for quick extractions or as a Python library for integration into other applications. The tool uses a fallback strategy, trying `markitdown` first and then `trafilatura` to ensure high reliability.
|
|
39
|
+
|
|
40
|
+
## Features
|
|
41
|
+
|
|
42
|
+
- **Dual Extractor Strategy**: Uses `markitdown` as the primary extractor and falls back to `trafilatura` for robustness.
|
|
43
|
+
- **CLI and Library Interface**: Can be used as a standalone command-line tool or as a Python library.
|
|
44
|
+
- **Error Handling Modes**: Supports a strict mode that raises exceptions on failure and a safe mode that returns an empty string.
|
|
45
|
+
- **Modern Python**: Fully typed with Python 3.14+ support.
|
|
46
|
+
|
|
47
|
+
## Prerequisites
|
|
48
|
+
|
|
49
|
+
- Python 3.14 or higher.
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
You can install the package using `pip` or `uv`.
|
|
54
|
+
|
|
55
|
+
### Using pip
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install py-web-text-extractor
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Using uv
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
uv add py-web-text-extractor
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Usage
|
|
68
|
+
|
|
69
|
+
The tool can be used via its command-line interface or as a Python library.
|
|
70
|
+
|
|
71
|
+
### Command-Line Interface (CLI)
|
|
72
|
+
|
|
73
|
+
The CLI is the quickest way to extract text from a URL.
|
|
74
|
+
|
|
75
|
+
**Basic Extraction:**
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
py-web-text-extractor https://example.com
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
**Safe Mode:**
|
|
82
|
+
|
|
83
|
+
In safe mode, the tool will return an empty string and exit gracefully if an error occurs.
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
py-web-text-extractor https://example.com --safe
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
**Verbose Mode:**
|
|
90
|
+
|
|
91
|
+
For troubleshooting, verbose mode provides detailed debug output.
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
py-web-text-extractor https://example.com --verbose
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**CLI Exit Codes:**
|
|
98
|
+
|
|
99
|
+
| Code | Meaning |
|
|
100
|
+
| ---- | ---------------------- |
|
|
101
|
+
| 0 | Success |
|
|
102
|
+
| 1 | No text content found |
|
|
103
|
+
| 2 | Invalid URL |
|
|
104
|
+
| 3 | Text extraction failed |
|
|
105
|
+
| 4 | Unexpected error |
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
### Python Library
|
|
109
|
+
|
|
110
|
+
For programmatic use, you can import the `ExtractorService`.
|
|
111
|
+
|
|
112
|
+
**Quick Start:**
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
from py_web_text_extractor.service.extractor_service import ExtractorService
|
|
116
|
+
from py_web_text_extractor.exception.exceptions import TextExtractionError, UrlIsNotValidException
|
|
117
|
+
|
|
118
|
+
# Initialize the service
|
|
119
|
+
service = ExtractorService()
|
|
120
|
+
|
|
121
|
+
# Strict mode: raises an exception on failure
|
|
122
|
+
try:
|
|
123
|
+
text = service.extract_text_from_page("https://example.com")
|
|
124
|
+
print(text)
|
|
125
|
+
except UrlIsNotValidException:
|
|
126
|
+
print("The provided URL is not valid.")
|
|
127
|
+
except TextExtractionError as e:
|
|
128
|
+
print(f"Failed to extract text: {e}")
|
|
129
|
+
|
|
130
|
+
# Safe mode: returns an empty string on failure
|
|
131
|
+
text_safe = service.extract_text_from_page_safe("https://invalid-url")
|
|
132
|
+
if not text_safe:
|
|
133
|
+
print("Extraction failed or no content found.")
|
|
134
|
+
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## API Reference
|
|
138
|
+
|
|
139
|
+
### `ExtractorService`
|
|
140
|
+
|
|
141
|
+
The main class for handling text extraction.
|
|
142
|
+
|
|
143
|
+
**Methods:**
|
|
144
|
+
|
|
145
|
+
- **`extract_text_from_page(url: str) -> str`**: Extracts text from the given URL. Raises a `TextExtractionError` or `UrlIsNotValidException` on failure.
|
|
146
|
+
- **`extract_text_from_page_safe(url: str) -> str`**: Extracts text from the given URL. Returns an empty string on failure.
|
|
147
|
+
|
|
148
|
+
### Exceptions
|
|
149
|
+
|
|
150
|
+
The library uses a set of custom exceptions to allow for specific error handling.
|
|
151
|
+
|
|
152
|
+
- `TextExtractionError`: Base exception for the library.
|
|
153
|
+
- `UrlIsNotValidException`: Raised for invalid URL formats.
|
|
154
|
+
- `TextExtractionFailure`: Raised when all extraction attempts fail.
|
|
155
|
+
- `MarkItDownExtractionException`: Specific failure from the `markitdown` extractor.
|
|
156
|
+
- `TrafilaturaExtractionException`: Specific failure from the `trafilatura` extractor.
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
## Architecture
|
|
160
|
+
|
|
161
|
+
The service employs a fallback strategy to maximize reliability:
|
|
162
|
+
1. It first attempts to extract content using `markitdown`.
|
|
163
|
+
2. If `markitdown` fails (e.g., returns a blank string or raises an error), the service automatically retries the extraction using `trafilatura`.
|
|
164
|
+
3. The first successful result is returned. If both extractors fail, an error is raised or an empty string is returned, depending on the mode.
|
|
165
|
+
|
|
166
|
+
## Testing
|
|
167
|
+
|
|
168
|
+
To run the test suite, first install the development dependencies and then run `pytest`.
|
|
169
|
+
|
|
170
|
+
> Patch pydub for Python 3.14 compatibility
|
|
171
|
+
> On Python 3.14, patching an old dependency may be required.
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
python scripts/patch_pydub.py
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
# Install dev dependencies
|
|
179
|
+
uv pip install -e ".[dev]"
|
|
180
|
+
|
|
181
|
+
# Run tests
|
|
182
|
+
uv run pytest
|
|
183
|
+
|
|
184
|
+
# Run linters and type checkers
|
|
185
|
+
uv run ruff check .
|
|
186
|
+
uv run ruff format .
|
|
187
|
+
uv run mypy src/
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## License
|
|
191
|
+
|
|
192
|
+
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# py-web-text-extractor
|
|
2
|
+
|
|
3
|
+
[](https://github.com/sanyokkua/py_web_text_extractor/actions)
|
|
4
|
+
[](https://pypi.org/project/py-web-text-extractor/)
|
|
5
|
+
[](https://www.python.org/downloads/)
|
|
6
|
+
[](https://opensource.org/licenses/MIT)
|
|
7
|
+
|
|
8
|
+
A CLI tool and Python library to extract clean text content from web pages.
|
|
9
|
+
|
|
10
|
+
## Overview
|
|
11
|
+
|
|
12
|
+
`py-web-text-extractor` provides a simple interface for extracting the main text content from HTML documents. It can be used as a command-line tool for quick extractions or as a Python library for integration into other applications. The tool uses a fallback strategy, trying `markitdown` first and then `trafilatura` to ensure high reliability.
|
|
13
|
+
|
|
14
|
+
## Features
|
|
15
|
+
|
|
16
|
+
- **Dual Extractor Strategy**: Uses `markitdown` as the primary extractor and falls back to `trafilatura` for robustness.
|
|
17
|
+
- **CLI and Library Interface**: Can be used as a standalone command-line tool or as a Python library.
|
|
18
|
+
- **Error Handling Modes**: Supports a strict mode that raises exceptions on failure and a safe mode that returns an empty string.
|
|
19
|
+
- **Modern Python**: Fully typed with Python 3.14+ support.
|
|
20
|
+
|
|
21
|
+
## Prerequisites
|
|
22
|
+
|
|
23
|
+
- Python 3.14 or higher.
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
You can install the package using `pip` or `uv`.
|
|
28
|
+
|
|
29
|
+
### Using pip
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install py-web-text-extractor
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Using uv
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
uv add py-web-text-extractor
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Usage
|
|
42
|
+
|
|
43
|
+
The tool can be used via its command-line interface or as a Python library.
|
|
44
|
+
|
|
45
|
+
### Command-Line Interface (CLI)
|
|
46
|
+
|
|
47
|
+
The CLI is the quickest way to extract text from a URL.
|
|
48
|
+
|
|
49
|
+
**Basic Extraction:**
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
py-web-text-extractor https://example.com
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
**Safe Mode:**
|
|
56
|
+
|
|
57
|
+
In safe mode, the tool will return an empty string and exit gracefully if an error occurs.
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
py-web-text-extractor https://example.com --safe
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
**Verbose Mode:**
|
|
64
|
+
|
|
65
|
+
For troubleshooting, verbose mode provides detailed debug output.
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
py-web-text-extractor https://example.com --verbose
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
**CLI Exit Codes:**
|
|
72
|
+
|
|
73
|
+
| Code | Meaning |
|
|
74
|
+
| ---- | ---------------------- |
|
|
75
|
+
| 0 | Success |
|
|
76
|
+
| 1 | No text content found |
|
|
77
|
+
| 2 | Invalid URL |
|
|
78
|
+
| 3 | Text extraction failed |
|
|
79
|
+
| 4 | Unexpected error |
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
### Python Library
|
|
83
|
+
|
|
84
|
+
For programmatic use, you can import the `ExtractorService`.
|
|
85
|
+
|
|
86
|
+
**Quick Start:**
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from py_web_text_extractor.service.extractor_service import ExtractorService
|
|
90
|
+
from py_web_text_extractor.exception.exceptions import TextExtractionError, UrlIsNotValidException
|
|
91
|
+
|
|
92
|
+
# Initialize the service
|
|
93
|
+
service = ExtractorService()
|
|
94
|
+
|
|
95
|
+
# Strict mode: raises an exception on failure
|
|
96
|
+
try:
|
|
97
|
+
text = service.extract_text_from_page("https://example.com")
|
|
98
|
+
print(text)
|
|
99
|
+
except UrlIsNotValidException:
|
|
100
|
+
print("The provided URL is not valid.")
|
|
101
|
+
except TextExtractionError as e:
|
|
102
|
+
print(f"Failed to extract text: {e}")
|
|
103
|
+
|
|
104
|
+
# Safe mode: returns an empty string on failure
|
|
105
|
+
text_safe = service.extract_text_from_page_safe("https://invalid-url")
|
|
106
|
+
if not text_safe:
|
|
107
|
+
print("Extraction failed or no content found.")
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## API Reference
|
|
112
|
+
|
|
113
|
+
### `ExtractorService`
|
|
114
|
+
|
|
115
|
+
The main class for handling text extraction.
|
|
116
|
+
|
|
117
|
+
**Methods:**
|
|
118
|
+
|
|
119
|
+
- **`extract_text_from_page(url: str) -> str`**: Extracts text from the given URL. Raises a `TextExtractionError` or `UrlIsNotValidException` on failure.
|
|
120
|
+
- **`extract_text_from_page_safe(url: str) -> str`**: Extracts text from the given URL. Returns an empty string on failure.
|
|
121
|
+
|
|
122
|
+
### Exceptions
|
|
123
|
+
|
|
124
|
+
The library uses a set of custom exceptions to allow for specific error handling.
|
|
125
|
+
|
|
126
|
+
- `TextExtractionError`: Base exception for the library.
|
|
127
|
+
- `UrlIsNotValidException`: Raised for invalid URL formats.
|
|
128
|
+
- `TextExtractionFailure`: Raised when all extraction attempts fail.
|
|
129
|
+
- `MarkItDownExtractionException`: Specific failure from the `markitdown` extractor.
|
|
130
|
+
- `TrafilaturaExtractionException`: Specific failure from the `trafilatura` extractor.
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
## Architecture
|
|
134
|
+
|
|
135
|
+
The service employs a fallback strategy to maximize reliability:
|
|
136
|
+
1. It first attempts to extract content using `markitdown`.
|
|
137
|
+
2. If `markitdown` fails (e.g., returns a blank string or raises an error), the service automatically retries the extraction using `trafilatura`.
|
|
138
|
+
3. The first successful result is returned. If both extractors fail, an error is raised or an empty string is returned, depending on the mode.
|
|
139
|
+
|
|
140
|
+
## Testing
|
|
141
|
+
|
|
142
|
+
To run the test suite, first install the development dependencies and then run `pytest`.
|
|
143
|
+
|
|
144
|
+
> Patch pydub for Python 3.14 compatibility
|
|
145
|
+
> On Python 3.14, patching an old dependency may be required.
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
python scripts/patch_pydub.py
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
# Install dev dependencies
|
|
153
|
+
uv pip install -e ".[dev]"
|
|
154
|
+
|
|
155
|
+
# Run tests
|
|
156
|
+
uv run pytest
|
|
157
|
+
|
|
158
|
+
# Run linters and type checkers
|
|
159
|
+
uv run ruff check .
|
|
160
|
+
uv run ruff format .
|
|
161
|
+
uv run mypy src/
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## License
|
|
165
|
+
|
|
166
|
+
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "py-web-text-extractor"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Library-wrapper for extracting text from Web Pages"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [{ name = "Oleksandr Kostenko", email = "sanyokkua@gmail.com" }]
|
|
7
|
+
requires-python = ">=3.14"
|
|
8
|
+
license = { text = "MIT" }
|
|
9
|
+
keywords = [
|
|
10
|
+
"web",
|
|
11
|
+
"text",
|
|
12
|
+
"extraction",
|
|
13
|
+
"scraping",
|
|
14
|
+
"html",
|
|
15
|
+
"markdown",
|
|
16
|
+
"trafilatura",
|
|
17
|
+
"markitdown",
|
|
18
|
+
]
|
|
19
|
+
classifiers = [
|
|
20
|
+
"Development Status :: 4 - Beta",
|
|
21
|
+
"Intended Audience :: Developers",
|
|
22
|
+
"License :: OSI Approved :: MIT License",
|
|
23
|
+
"Operating System :: OS Independent",
|
|
24
|
+
"Programming Language :: Python :: 3",
|
|
25
|
+
"Programming Language :: Python :: 3.14",
|
|
26
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
27
|
+
"Topic :: Text Processing",
|
|
28
|
+
"Typing :: Typed",
|
|
29
|
+
]
|
|
30
|
+
dependencies = ["markitdown>=0.0.2", "trafilatura>=2.0.0", "typer>=0.12.0"]
|
|
31
|
+
|
|
32
|
+
[project.scripts]
|
|
33
|
+
py-web-text-extractor = "py_web_text_extractor.cli:app"
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://github.com/sanyokkua/py_web_text_extractor"
|
|
37
|
+
Repository = "https://github.com/sanyokkua/py_web_text_extractor"
|
|
38
|
+
Issues = "https://github.com/sanyokkua/py_web_text_extractor/issues"
|
|
39
|
+
|
|
40
|
+
[build-system]
|
|
41
|
+
requires = ["uv_build>=0.9.28,<0.10.0"]
|
|
42
|
+
build-backend = "uv_build"
|
|
43
|
+
|
|
44
|
+
[dependency-groups]
|
|
45
|
+
dev = ["pytest>=9.0.2", "pytest-cov>=7.0.0", "ruff>=0.15.0"]
|
|
46
|
+
|
|
47
|
+
[tool.ruff]
|
|
48
|
+
target-version = "py314"
|
|
49
|
+
src = ["src", "tests"]
|
|
50
|
+
line-length = 120
|
|
51
|
+
extend-exclude = ["tests"]
|
|
52
|
+
|
|
53
|
+
[tool.ruff.lint]
|
|
54
|
+
dummy-variable-rgx = "^_$"
|
|
55
|
+
|
|
56
|
+
select = [
|
|
57
|
+
"E", # pycodestyle errors
|
|
58
|
+
"W", # pycodestyle warnings
|
|
59
|
+
"F", # Pyflakes
|
|
60
|
+
"I", # isort
|
|
61
|
+
"B", # flake8-bugbear
|
|
62
|
+
"C4", # flake8-comprehensions
|
|
63
|
+
"UP", # pyupgrade
|
|
64
|
+
"ARG", # flake8-unused-arguments
|
|
65
|
+
"SIM", # flake8-simplify
|
|
66
|
+
"TCH", # flake8-type-checking
|
|
67
|
+
"PTH", # flake8-use-pathlib
|
|
68
|
+
"ERA", # eradicate (commented code)
|
|
69
|
+
"PL", # Pylint
|
|
70
|
+
"RUF", # Ruff-specific rules
|
|
71
|
+
"D", # pydocstyle (docstring linting)
|
|
72
|
+
"ANN", # flake8-annotations (type hints)
|
|
73
|
+
]
|
|
74
|
+
ignore = [
|
|
75
|
+
"E501", # line too long (handled by formatter)
|
|
76
|
+
"PLR0913", # too many arguments
|
|
77
|
+
]
|
|
78
|
+
|
|
79
|
+
[tool.ruff.format]
|
|
80
|
+
docstring-code-format = true
|
|
81
|
+
indent-style = 'space'
|
|
82
|
+
quote-style = "double"
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
[tool.ruff.lint.per-file-ignores]
|
|
86
|
+
"tests/**/*.py" = [
|
|
87
|
+
"S101",
|
|
88
|
+
"ARG",
|
|
89
|
+
"PLR2004",
|
|
90
|
+
"D",
|
|
91
|
+
"ANN001",
|
|
92
|
+
"ANN201",
|
|
93
|
+
"ANN202",
|
|
94
|
+
"ANN401",
|
|
95
|
+
"ANN204",
|
|
96
|
+
]
|
|
97
|
+
"**/__init__.py" = ["F401", "D104"]
|
|
98
|
+
|
|
99
|
+
[tool.ruff.lint.isort]
|
|
100
|
+
known-first-party = ["py_web_text_extractor"]
|
|
101
|
+
|
|
102
|
+
[tool.ruff.lint.pydocstyle]
|
|
103
|
+
convention = "google"
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# PYTEST
|
|
107
|
+
[tool.pytest.ini_options]
|
|
108
|
+
minversion = "8.0"
|
|
109
|
+
pythonpath = ["src"]
|
|
110
|
+
testpaths = ["tests"]
|
|
111
|
+
addopts = ["-ra", "-q", "--strict-markers", "--strict-config"]
|
|
112
|
+
markers = ["slow: marks tests as slow", "integration: marks integration tests"]
|
|
113
|
+
filterwarnings = ["error", "ignore::DeprecationWarning"]
|
|
114
|
+
|
|
115
|
+
# COVERAGE
|
|
116
|
+
[tool.coverage.run]
|
|
117
|
+
source = ["src"]
|
|
118
|
+
branch = true
|
|
119
|
+
parallel = true
|
|
120
|
+
omit = ["**/__init__.py"]
|
|
121
|
+
|
|
122
|
+
[tool.coverage.report]
|
|
123
|
+
exclude_lines = [
|
|
124
|
+
"pragma: no cover",
|
|
125
|
+
"def __repr__",
|
|
126
|
+
"raise NotImplementedError",
|
|
127
|
+
"if TYPE_CHECKING:",
|
|
128
|
+
"if __name__ == .__main__.:",
|
|
129
|
+
"@abstractmethod",
|
|
130
|
+
]
|
|
131
|
+
fail_under = 80
|
|
132
|
+
show_missing = true
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Extract clean text content from web pages."""
|
|
2
|
+
|
|
3
|
+
from py_web_text_extractor.exception.exceptions import (
|
|
4
|
+
MarkItDownExtractionException,
|
|
5
|
+
TextExtractionError,
|
|
6
|
+
TextExtractionFailure,
|
|
7
|
+
TrafilaturaExtractionException,
|
|
8
|
+
UrlIsNotValidException,
|
|
9
|
+
)
|
|
10
|
+
from py_web_text_extractor.main import Extractor, ExtractorService, app, create_extractor_service
|
|
11
|
+
from py_web_text_extractor.tools.validation import is_blank_string, is_valid_url
|
|
12
|
+
|
|
13
|
+
__version__ = "0.1.0"
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"Extractor",
|
|
17
|
+
"ExtractorService",
|
|
18
|
+
"MarkItDownExtractionException",
|
|
19
|
+
"TextExtractionError",
|
|
20
|
+
"TextExtractionFailure",
|
|
21
|
+
"TrafilaturaExtractionException",
|
|
22
|
+
"UrlIsNotValidException",
|
|
23
|
+
"app",
|
|
24
|
+
"create_extractor_service",
|
|
25
|
+
"is_blank_string",
|
|
26
|
+
"is_valid_url",
|
|
27
|
+
]
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Abstract base classes and interfaces for the py_web_text_extractor library.
|
|
2
|
+
|
|
3
|
+
This module contains the foundational abstract classes that define the
|
|
4
|
+
contracts and interfaces for all text extraction services in the library.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from py_web_text_extractor.abstract.extractor import Extractor
|
|
8
|
+
|
|
9
|
+
__all__ = ["Extractor"]
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Abstract base classes for text extraction services.
|
|
2
|
+
|
|
3
|
+
This module defines the interface that all text extraction services must implement,
|
|
4
|
+
ensuring consistency across different extraction implementations. The Extractor
|
|
5
|
+
abstract base class serves as the foundation for all concrete extraction services,
|
|
6
|
+
enforcing a uniform API and behavior contract.
|
|
7
|
+
|
|
8
|
+
All text extraction services in the py_web_text_extractor library must inherit
|
|
9
|
+
from the Extractor ABC and implement its abstract methods to ensure compatibility
|
|
10
|
+
with the library's architecture and fallback strategies.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from abc import ABC, abstractmethod
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Extractor(ABC):
    """Interface that every text-extraction service must satisfy.

    Concrete implementations provide two entry points with identical
    signatures but different failure semantics:

    - ``extract_text_from_page``: strict mode; raises library exceptions
      when extraction cannot be completed.
    - ``extract_text_from_page_safe``: lenient mode; suppresses errors
      and yields an empty string instead.

    Implementations are expected to log failures, keep return types
    consistent across backends, and honor both failure modes so the
    library's fallback strategies can rely on a uniform contract.
    """

    @abstractmethod
    def extract_text_from_page_safe(self, url: str) -> str:
        r"""Extract page text, swallowing every error.

        Implementations must not let any exception escape this method;
        every failure is reported as an empty return value, which makes
        the method suitable for callers whose control flow must not be
        interrupted by extraction problems.

        Args:
            url: HTTP/HTTPS address of the page to process.

        Returns:
            The extracted text on success, or an empty string on any
            failure.

        Example:
            >>> extractor.extract_text_from_page_safe("https://example.com")
            "Example Domain\\n\\nThis domain is for use in illustrative examples..."

            >>> extractor.extract_text_from_page_safe("invalid-url")
            ""  # empty string instead of an exception
        """

    @abstractmethod
    def extract_text_from_page(self, url: str) -> str:
        r"""Extract page text, raising on failure.

        Strict counterpart of :meth:`extract_text_from_page_safe`:
        implementations signal each failure category with a dedicated
        exception so callers can react to the exact cause.

        Args:
            url: HTTP/HTTPS address of the page to process. Must be a
                valid URL string.

        Returns:
            The extracted text content.

        Raises:
            UrlIsNotValidException: The URL is malformed or invalid.
            MarkItDownExtractionException: The MarkItDown backend failed.
            TrafilaturaExtractionException: The Trafilatura backend failed.
            TextExtractionFailure: Every extraction backend failed.
            TextExtractionError: Any other extraction-related error.

        Example:
            >>> extractor.extract_text_from_page("https://example.com")
            "Example Domain\\n\\nThis domain is for use in illustrative examples..."

            >>> extractor.extract_text_from_page("invalid-url")
            # Raises UrlIsNotValidException
        """
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Command-line interface for web text extraction."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
import typer
|
|
7
|
+
|
|
8
|
+
from py_web_text_extractor.exception.exceptions import (
|
|
9
|
+
TextExtractionError,
|
|
10
|
+
UrlIsNotValidException,
|
|
11
|
+
)
|
|
12
|
+
from py_web_text_extractor.service.extractor_service import ExtractorService
|
|
13
|
+
|
|
14
|
+
app = typer.Typer(
|
|
15
|
+
name="py-web-text-extractor",
|
|
16
|
+
help="Extract clean text content from web pages",
|
|
17
|
+
add_completion=False,
|
|
18
|
+
no_args_is_help=True,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _setup_logging(verbose: bool = False) -> None:
|
|
23
|
+
"""Configure CLI logging level.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
verbose: Enable DEBUG level logging when True; WARNING level when False.
|
|
27
|
+
"""
|
|
28
|
+
level = logging.DEBUG if verbose else logging.WARNING
|
|
29
|
+
logging.basicConfig(
|
|
30
|
+
level=level,
|
|
31
|
+
format="%(levelname)s: %(message)s",
|
|
32
|
+
stream=sys.stderr,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@app.command()
def main(
    url: str,
    safe: bool = False,
    verbose: bool = False,
) -> None:
    """Extract text from a web page.

    Args:
        url: HTTP/HTTPS URL to extract text from.
        safe: Return empty string on error instead of exiting with failure code.
        verbose: Enable debug logging for troubleshooting.

    Exit codes:
        0: Success (text extracted)
        1: No text content found
        2: Invalid URL
        3: Text extraction failed
        4: Unexpected error
    """
    _setup_logging(verbose)

    log = logging.getLogger(__name__)
    log.debug("Starting extraction for URL: %s", url)

    try:
        extractor = ExtractorService()
        if safe:
            content = extractor.extract_text_from_page_safe(url)
        else:
            content = extractor.extract_text_from_page(url)

        # Empty output means no usable content; report on stderr.
        if not content:
            print("No text content found", file=sys.stderr)
            sys.exit(1)
        print(content)
        sys.exit(0)

    except UrlIsNotValidException as exc:
        print(f"Error: Invalid URL - {exc}", file=sys.stderr)
        sys.exit(2)
    except TextExtractionError as exc:
        print(f"Error: Text extraction failed - {exc}", file=sys.stderr)
        sys.exit(3)
    except Exception as exc:
        # Top-level CLI boundary: map anything unforeseen to exit code 4.
        print(f"Error: Unexpected error - {exc}", file=sys.stderr)
        sys.exit(4)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
if __name__ == "__main__":
|
|
84
|
+
app()
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Exception classes for the py_web_text_extractor library.
|
|
2
|
+
|
|
3
|
+
This module exports all custom exception classes used throughout the text
|
|
4
|
+
extraction process, providing a consistent error handling mechanism.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from py_web_text_extractor.exception.exceptions import (
|
|
8
|
+
MarkItDownExtractionException,
|
|
9
|
+
TextExtractionError,
|
|
10
|
+
TextExtractionFailure,
|
|
11
|
+
TrafilaturaExtractionException,
|
|
12
|
+
UrlIsNotValidException,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"MarkItDownExtractionException",
|
|
17
|
+
"TextExtractionError",
|
|
18
|
+
"TextExtractionFailure",
|
|
19
|
+
"TrafilaturaExtractionException",
|
|
20
|
+
"UrlIsNotValidException",
|
|
21
|
+
]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Custom exceptions for text extraction failures."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class TextExtractionError(Exception):
    """Root of the library's exception hierarchy; every extraction error derives from it."""


class UrlIsNotValidException(TextExtractionError):
    """The supplied URL is malformed or otherwise unusable."""


class MarkItDownExtractionException(TextExtractionError):
    """The MarkItDown backend could not extract text."""


class TrafilaturaExtractionException(TextExtractionError):
    """The Trafilatura backend could not extract text."""


class TextExtractionFailure(TextExtractionError):
    """Every available extraction backend failed for the URL."""
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Text extraction service factory and entry points."""
|
|
2
|
+
|
|
3
|
+
from py_web_text_extractor.cli import app
|
|
4
|
+
from py_web_text_extractor.service.extractor_service import ExtractorService
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def create_extractor_service() -> ExtractorService:
    """Build and return a ready-to-use text extraction service.

    Returns:
        A fresh :class:`ExtractorService` instance configured for
        text extraction.

    Examples:
        >>> from py_web_text_extractor import create_extractor_service
        >>> service = create_extractor_service()
        >>> text = service.extract_text_from_page("https://example.com")
        >>> len(text) > 0
        True
    """
    service = ExtractorService()
    return service
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Convenience alias for the main service class
|
|
24
|
+
Extractor = ExtractorService
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
__all__ = ["Extractor", "ExtractorService", "app", "create_extractor_service"]
|
|
File without changes
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Text extraction services for the py_web_text_extractor library.
|
|
2
|
+
|
|
3
|
+
This module contains the core service implementations for web text extraction,
|
|
4
|
+
including the main ExtractorService with fallback strategy and individual
|
|
5
|
+
extractor implementations for different libraries.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from py_web_text_extractor.service.extractor_service import ExtractorService
|
|
9
|
+
from py_web_text_extractor.service.markitdown_extractor import extract_text as markitdown_extract
|
|
10
|
+
from py_web_text_extractor.service.trafilatura_extractor import extract_text as trafilatura_extract
|
|
11
|
+
|
|
12
|
+
__all__ = ["ExtractorService", "markitdown_extract", "trafilatura_extract"]
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Web text extraction service with fallback strategy.
|
|
2
|
+
|
|
3
|
+
Provides a unified interface for extracting clean text content from web pages
|
|
4
|
+
using MarkItDown (primary) and Trafilatura (fallback) extraction methods.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
from typing import override
|
|
9
|
+
|
|
10
|
+
import py_web_text_extractor.service.markitdown_extractor as mk_extractor
|
|
11
|
+
import py_web_text_extractor.service.trafilatura_extractor as tr_extractor
|
|
12
|
+
from py_web_text_extractor.abstract.extractor import Extractor
|
|
13
|
+
from py_web_text_extractor.exception.exceptions import (
|
|
14
|
+
MarkItDownExtractionException,
|
|
15
|
+
TextExtractionFailure,
|
|
16
|
+
TrafilaturaExtractionException,
|
|
17
|
+
UrlIsNotValidException,
|
|
18
|
+
)
|
|
19
|
+
from py_web_text_extractor.tools.validation import is_blank_string, is_valid_url
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ExtractorService(Extractor):
    """Text extraction service with MarkItDown/Trafilatura fallback strategy."""

    @override
    def extract_text_from_page(self, url: str) -> str:
        """Extract text content from a web page.

        Attempts extraction using MarkItDown first, falling back to Trafilatura
        if the primary method fails. Raises an exception if both methods fail.

        Args:
            url: HTTP/HTTPS URL to extract text from. Must be a non-empty string
                starting with http:// or https://.

        Returns:
            Cleaned text content from the web page.

        Raises:
            UrlIsNotValidException: If url is None, empty, or not a valid HTTP/HTTPS URL.
            TextExtractionFailure: If both extraction methods fail.

        Examples:
            >>> service = ExtractorService()
            >>> text = service.extract_text_from_page("https://example.com")
            >>> len(text) > 0
            True
        """
        # Validate the input before touching the network.
        if not isinstance(url, str):
            logger.debug("Non-string URL provided: %s", url)
            raise UrlIsNotValidException(f"URL must be a string, got {type(url).__name__}")

        if is_blank_string(url):
            logger.debug("Empty or blank URL provided")
            raise UrlIsNotValidException("URL cannot be empty or blank")

        if not is_valid_url(url):
            logger.debug("Invalid URL provided: %s", url)
            raise UrlIsNotValidException(f"Invalid URL: {url}")

        # Primary method: MarkItDown. On success we return immediately.
        try:
            logger.debug("Attempting to extract text from %s using MarkItDown", url)
            return mk_extractor.extract_text(url)
        except MarkItDownExtractionException as e:
            logger.info("MarkItDown extraction failed for %s: %s. Falling back to Trafilatura", url, e)

        # Fallback method: Trafilatura. Only reached if MarkItDown failed.
        try:
            logger.debug("Attempting to extract text from %s using Trafilatura", url)
            return tr_extractor.extract_text(url)
        except TrafilaturaExtractionException as e:
            logger.warning("Trafilatura extraction failed for %s: %s", url, e)

            error_msg = f"Failed to extract text from {url} using both MarkItDown and Trafilatura"
            logger.error(error_msg)
            # Chain to the Trafilatura error so the original traceback is
            # preserved for callers inspecting __cause__ (the previous version
            # raised without `from` and dropped the causal link).
            raise TextExtractionFailure(error_msg) from e

    @override
    def extract_text_from_page_safe(self, url: str) -> str:
        """Extract text content with graceful error handling.

        Returns empty string on any failure instead of raising exceptions.
        Suitable for batch processing where individual failures should not
        interrupt the overall workflow.

        Args:
            url: URL to extract text from (any value accepted).

        Returns:
            Extracted text if successful, empty string otherwise.

        Examples:
            >>> service = ExtractorService()
            >>> text = service.extract_text_from_page_safe("https://example.com")
            >>> isinstance(text, str)
            True

            >>> service.extract_text_from_page_safe("invalid-url")
            ''
        """
        try:
            return self.extract_text_from_page(url)
        except UrlIsNotValidException as e:
            logger.warning("Invalid URL provided: %s", e)
            return ""
        except TextExtractionFailure as e:
            logger.warning("Text extraction failed: %s", e)
            return ""
        except Exception as e:
            # Last-resort guard: keep batch workflows alive on unexpected errors.
            logger.warning("Unexpected error during text extraction: %s", e)
            return ""
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""MarkItDown text extraction module.
|
|
2
|
+
|
|
3
|
+
Provides text extraction from web pages using the MarkItDown library.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
from markitdown import MarkItDown
|
|
9
|
+
|
|
10
|
+
from py_web_text_extractor.exception.exceptions import MarkItDownExtractionException
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def extract_text(url: str) -> str:
    r"""Fetch *url* and convert its content to plain text via MarkItDown.

    Args:
        url: HTTP/HTTPS URL to extract text from.

    Returns:
        Extracted text content from the web page.

    Raises:
        MarkItDownExtractionException: If extraction fails due to network issues,
            invalid HTML, or other MarkItDown processing errors.

    Examples:
        >>> extract_text("https://example.com")
        "Example Domain\\n\\nThis domain is for use in illustrative examples..."

        >>> extract_text("https://news.example.com/article")
        "Article Title\\n\\nThe main content of the article..."
    """
    logger.debug("Starting MarkItDown extraction for URL: %s", url)

    try:
        # Fetch + convert in one pass; any failure is wrapped below.
        content = MarkItDown().convert(url).text_content
        logger.info("Successfully extracted text from %s using MarkItDown", url)
        return content
    except Exception as exc:
        logger.warning("MarkItDown extraction failed for %s: %s", url, exc)
        raise MarkItDownExtractionException(f"MarkItDown extraction failed for {url}: {exc!s}") from exc
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Trafilatura text extraction module.
|
|
2
|
+
|
|
3
|
+
Provides text extraction from web pages using the Trafilatura library.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
from trafilatura import extract, fetch_url
|
|
9
|
+
|
|
10
|
+
from py_web_text_extractor.exception.exceptions import TrafilaturaExtractionException
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def extract_text(url: str) -> str:
    r"""Extract text content from a web page using Trafilatura.

    Args:
        url: HTTP/HTTPS URL to extract text from.

    Returns:
        Extracted text content in markdown format. Returns an empty string if
        Trafilatura finds no extractable content.

    Raises:
        TrafilaturaExtractionException: If content cannot be fetched or processed.

    Examples:
        >>> extract_text("https://example.com")
        "# Example Domain\\n\\nThis domain is for use in illustrative examples..."

        >>> extract_text("https://blog.example.com/post")
        "# Post Title\\n\\n## Section Header\\n\\nMain content..."
    """
    logger.debug("Starting Trafilatura extraction for URL: %s", url)

    try:
        content = fetch_url(url)
        if content is None:
            # fetch_url signals download failure by returning None.
            logger.warning("Failed to fetch content from %s using Trafilatura", url)
            raise TrafilaturaExtractionException(f"Failed to fetch content from {url}")

        # extract() returns None when no main content is found; normalize to "".
        text = extract(content, output_format="markdown")
        extracted_text = text or ""

        if extracted_text:
            logger.info("Successfully extracted text from %s using Trafilatura", url)
        else:
            logger.debug("No text content found for %s using Trafilatura", url)

        return extracted_text
    except TrafilaturaExtractionException:
        # Bug fix: the fetch-failure exception raised above was previously
        # caught by the generic handler below and double-wrapped; re-raise it
        # unchanged instead.
        raise
    except Exception as e:
        logger.warning("Trafilatura extraction failed for %s: %s", url, e)
        raise TrafilaturaExtractionException(f"Trafilatura extraction failed for {url}: {e!s}") from e
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Utility functions and tools for the py_web_text_extractor library.
|
|
2
|
+
|
|
3
|
+
This module provides helper functions and utilities used throughout the text
|
|
4
|
+
extraction process, including validation, formatting, and other supporting
|
|
5
|
+
functionality.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from py_web_text_extractor.tools.validation import is_blank_string, is_valid_url
|
|
9
|
+
|
|
10
|
+
__all__ = ["is_blank_string", "is_valid_url"]
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Input validation utilities."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from urllib.parse import urlparse
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def is_blank_string(value: str | None) -> bool:
|
|
8
|
+
"""Check if a string is None, empty, or whitespace-only.
|
|
9
|
+
|
|
10
|
+
Args:
|
|
11
|
+
value: String to check. May be None.
|
|
12
|
+
|
|
13
|
+
Returns:
|
|
14
|
+
True if value is None or contains no non-whitespace characters.
|
|
15
|
+
|
|
16
|
+
Examples:
|
|
17
|
+
>>> is_blank_string(None)
|
|
18
|
+
True
|
|
19
|
+
>>> is_blank_string("")
|
|
20
|
+
True
|
|
21
|
+
>>> is_blank_string(" ")
|
|
22
|
+
True
|
|
23
|
+
>>> is_blank_string("hello")
|
|
24
|
+
False
|
|
25
|
+
"""
|
|
26
|
+
return value is None or len(value.strip()) == 0
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def is_valid_url(value: str | None) -> bool:
|
|
30
|
+
"""Validate HTTP/HTTPS URL format.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
value: URL string to validate. Must start with http:// or https://
|
|
34
|
+
and contain no whitespace characters.
|
|
35
|
+
|
|
36
|
+
Returns:
|
|
37
|
+
True if value is a properly formatted HTTP/HTTPS URL with scheme and netloc.
|
|
38
|
+
|
|
39
|
+
Examples:
|
|
40
|
+
>>> is_valid_url("https://example.com")
|
|
41
|
+
True
|
|
42
|
+
>>> is_valid_url("http://sub.example.com/path")
|
|
43
|
+
True
|
|
44
|
+
>>> is_blank_string("example.com")
|
|
45
|
+
True
|
|
46
|
+
>>> is_valid_url("")
|
|
47
|
+
False
|
|
48
|
+
>>> is_valid_url(None)
|
|
49
|
+
False
|
|
50
|
+
>>> is_valid_url("ftp://example.com")
|
|
51
|
+
False
|
|
52
|
+
"""
|
|
53
|
+
if value is None:
|
|
54
|
+
return False
|
|
55
|
+
if not isinstance(value, str):
|
|
56
|
+
return False
|
|
57
|
+
if is_blank_string(value) or re.search(r"\s", value):
|
|
58
|
+
return False
|
|
59
|
+
if not (value.startswith("http://") or value.startswith("https://")):
|
|
60
|
+
return False
|
|
61
|
+
|
|
62
|
+
try:
|
|
63
|
+
result = urlparse(value)
|
|
64
|
+
return bool(result.scheme and result.netloc)
|
|
65
|
+
except ValueError:
|
|
66
|
+
return False
|