fetchxml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fetchxml/__init__.py +4 -0
- fetchxml/client.py +70 -0
- fetchxml/exceptions.py +3 -0
- fetchxml-0.1.0.dist-info/METADATA +356 -0
- fetchxml-0.1.0.dist-info/RECORD +8 -0
- fetchxml-0.1.0.dist-info/WHEEL +5 -0
- fetchxml-0.1.0.dist-info/licenses/LICENSE +21 -0
- fetchxml-0.1.0.dist-info/top_level.txt +1 -0
fetchxml/__init__.py
ADDED
fetchxml/client.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
import time
|
|
3
|
+
from .exceptions import FetchXMLError
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class FetchXML:
    """Fetch XML documents through a shared, browser-like HTTP session.

    One ``requests.Session`` is created per client and reused for every
    request, so cookies set by earlier responses are sent on later ones.

    Args:
        base_url: Optional URL requested once at construction time to
            initialize the session. It is also used as the default
            ``Referer`` header and for the session refresh after an
            HTTP 403 response.
        delay: Seconds to sleep at the start of each ``fetch`` call.
        timeout: Timeout, in seconds, passed to every HTTP request.

    Raises:
        FetchXMLError: If ``base_url`` is given and the initial session
            bootstrap fails.
    """

    def __init__(self, base_url=None, delay=0.5, timeout=15):
        self.base_url = base_url
        self.delay = delay
        self.timeout = timeout
        self.session = requests.Session()
        self._init_headers()
        # Bootstrapping is opt-in: without a base_url there is nothing to hit.
        if base_url:
            self._bootstrap_session()

    def _init_headers(self):
        """Install the default browser-like headers on the session."""
        self.session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Accept-Language": "en-US,en;q=0.9",
            "Connection": "keep-alive",
        })

    def _bootstrap_session(self):
        """Request ``base_url`` once so the session can pick up cookies.

        Raises:
            FetchXMLError: If the request itself fails, or if the response
                status is anything other than 200.
        """
        try:
            r = self.session.get(self.base_url, timeout=self.timeout)
        except Exception as e:
            # Kept broad on purpose: callers have always received
            # FetchXMLError for any failure of the bootstrap request.
            raise FetchXMLError(f"Session bootstrap failed: {str(e)}") from e

        # Checked outside the try block so this error is raised as-is
        # instead of being caught and wrapped a second time.
        if r.status_code != 200:
            raise FetchXMLError(
                f"Failed to initialize session. Status {r.status_code}"
            )

    def _request(self, url, headers):
        """Issue one GET on the shared session.

        Transport failures reported by ``requests`` (timeouts, connection
        errors, invalid URLs) are converted to ``FetchXMLError`` with the
        original exception attached as ``__cause__``.
        """
        try:
            return self.session.get(
                url,
                headers=headers,
                timeout=self.timeout
            )
        except requests.RequestException as e:
            raise FetchXMLError(f"Failed to fetch XML: {str(e)}") from e

    def fetch(self, url, referer=None):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the XML document.
            referer: Value for the ``Referer`` header. When omitted,
                ``base_url`` is used if one was configured; otherwise no
                ``Referer`` header is added.

        Returns:
            The response body decoded as text (``response.text``).

        Raises:
            FetchXMLError: If the request fails at the transport level, if
                a session refresh fails, or if the final response status
                is not 200.
        """
        headers = {
            "Accept": "application/xml,text/xml,*/*;q=0.1",
        }

        # An explicit referer wins; base_url is the fallback.
        chosen_referer = referer or self.base_url
        if chosen_referer:
            headers["Referer"] = chosen_referer

        time.sleep(self.delay)

        response = self._request(url, headers)

        # On 403, refresh the session once and retry. This is only possible
        # when a base_url is available to bootstrap against.
        if response.status_code == 403 and self.base_url:
            self._bootstrap_session()
            response = self._request(url, headers)

        if response.status_code != 200:
            raise FetchXMLError(
                f"Failed to fetch XML. Status {response.status_code}"
            )

        return response.text
|
fetchxml/exceptions.py
ADDED
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fetchxml
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight session-based XML fetcher with browser-like behavior.
|
|
5
|
+
Author: Saurabh Kumar Agarwal
|
|
6
|
+
Project-URL: Homepage, https://github.com/yourusername/fetchxml
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: requests
|
|
11
|
+
Dynamic: license-file
|
|
12
|
+
|
|
13
|
+
**fetchxml** is an open-source Python package; this document is its README.
|
|
14
|
+
|
|
15
|
+
It covers installation, usage, configuration, and error handling.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
# 📦 fetchxml
|
|
20
|
+
|
|
21
|
+
Lightweight, session-based XML fetcher for Python.
|
|
22
|
+
|
|
23
|
+
`fetchxml` provides a clean, reusable way to fetch XML from web endpoints that require:
|
|
24
|
+
|
|
25
|
+
* Browser-like headers
|
|
26
|
+
* Session initialization
|
|
27
|
+
* Cookie handling
|
|
28
|
+
* Referer validation
|
|
29
|
+
* Basic anti-bot protection handling
|
|
30
|
+
|
|
31
|
+
It abstracts session bootstrapping and retry logic into a simple interface.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## 🚀 Why fetchxml?
|
|
36
|
+
|
|
37
|
+
Some websites block simple HTTP requests and require:
|
|
38
|
+
|
|
39
|
+
* A session cookie
|
|
40
|
+
* Proper User-Agent
|
|
41
|
+
* Referer header
|
|
42
|
+
* Basic browser simulation
|
|
43
|
+
|
|
44
|
+
`fetchxml` handles this automatically.
|
|
45
|
+
|
|
46
|
+
Instead of writing repetitive session logic every time, you can do:
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from fetchxml import FetchXML
|
|
50
|
+
|
|
51
|
+
client = FetchXML(base_url="https://example.com")
|
|
52
|
+
xml = client.fetch("https://example.com/file.xml")
|
|
53
|
+
|
|
54
|
+
print(xml)
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
# 📥 Installation
|
|
60
|
+
|
|
61
|
+
## Option 1 — Install from local project
|
|
62
|
+
|
|
63
|
+
From the project root (where `pyproject.toml` is located):
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install .
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
For development mode:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install -e .
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Option 2 — Install from PyPI (after publishing)
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install fetchxml
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
# 🔧 Basic Usage
|
|
86
|
+
|
|
87
|
+
## 1️⃣ Simple XML Fetch
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from fetchxml import FetchXML
|
|
91
|
+
|
|
92
|
+
client = FetchXML()
|
|
93
|
+
|
|
94
|
+
xml = client.fetch("https://example.com/sample.xml")
|
|
95
|
+
|
|
96
|
+
print(xml[:500])
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Use this when the target site does NOT require session bootstrap.
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## 2️⃣ Fetch XML With Session Bootstrap
|
|
104
|
+
|
|
105
|
+
Some sites require hitting their homepage first to establish cookies.
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from fetchxml import FetchXML
|
|
109
|
+
|
|
110
|
+
client = FetchXML(base_url="https://example.com")
|
|
111
|
+
|
|
112
|
+
xml = client.fetch("https://example.com/sample.xml")
|
|
113
|
+
|
|
114
|
+
print(xml)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
`base_url` triggers automatic session initialization.
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## 3️⃣ Fetch With Custom Referer
|
|
122
|
+
|
|
123
|
+
If a specific referer header is required:
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
xml = client.fetch(
|
|
127
|
+
"https://example.com/sample.xml",
|
|
128
|
+
referer="https://example.com/dashboard"
|
|
129
|
+
)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
# ⚙️ Configuration Options
|
|
135
|
+
|
|
136
|
+
When initializing:
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
client = FetchXML(
|
|
140
|
+
base_url="https://example.com", # optional
|
|
141
|
+
delay=0.5, # delay between requests (seconds)
|
|
142
|
+
timeout=15 # request timeout (seconds)
|
|
143
|
+
)
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### Parameters
|
|
147
|
+
|
|
148
|
+
| Parameter | Description |
|
|
149
|
+
| ---------- | --------------------------------------------- |
|
|
150
|
+
| `base_url` | URL used to bootstrap session cookies |
|
|
151
|
+
| `delay` | Sleep time before each request (default 0.5s) |
|
|
152
|
+
| `timeout` | Request timeout in seconds (default 15s) |
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
# 🔁 Automatic Retry Behavior
|
|
157
|
+
|
|
158
|
+
If a request returns **HTTP 403**, `fetchxml` will:
|
|
159
|
+
|
|
160
|
+
1. Attempt to re-bootstrap session (if `base_url` provided)
|
|
161
|
+
2. Retry the request once
|
|
162
|
+
|
|
163
|
+
If the request still fails, a `FetchXMLError` is raised.
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
# ❌ Exception Handling
|
|
168
|
+
|
|
169
|
+
All errors raise:
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
FetchXMLError
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
Import it like:
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
from fetchxml import FetchXMLError
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
Example:
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
from fetchxml import FetchXML, FetchXMLError
|
|
185
|
+
|
|
186
|
+
client = FetchXML(base_url="https://example.com")
|
|
187
|
+
|
|
188
|
+
try:
|
|
189
|
+
xml = client.fetch("https://example.com/sample.xml")
|
|
190
|
+
print(xml)
|
|
191
|
+
except FetchXMLError as e:
|
|
192
|
+
print("Failed to fetch XML:", str(e))
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
# 🔍 What Triggers FetchXMLError?
|
|
198
|
+
|
|
199
|
+
* Session bootstrap failure
|
|
200
|
+
* Non-200 HTTP response
|
|
201
|
+
* Timeout
|
|
202
|
+
* Connection error
|
|
203
|
+
* Persistent 403 after retry
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
# 🛡️ Rate Limiting
|
|
208
|
+
|
|
209
|
+
`delay` ensures a pause before each request:
|
|
210
|
+
|
|
211
|
+
```python
|
|
212
|
+
client = FetchXML(delay=1.5)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
Recommended for:
|
|
216
|
+
|
|
217
|
+
* Bulk XML downloads
|
|
218
|
+
* Respecting server load
|
|
219
|
+
* Avoiding bot detection
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
# 📂 Example: Download and Save XML
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
from fetchxml import FetchXML
|
|
227
|
+
|
|
228
|
+
client = FetchXML(base_url="https://example.com")
|
|
229
|
+
|
|
230
|
+
url = "https://example.com/sample.xml"
|
|
231
|
+
xml = client.fetch(url)
|
|
232
|
+
|
|
233
|
+
with open("sample.xml", "w", encoding="utf-8") as f:
|
|
234
|
+
f.write(xml)
|
|
235
|
+
|
|
236
|
+
print("Saved successfully.")
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
# 🧠 Advanced: Reusing One Client for Multiple Files
|
|
242
|
+
|
|
243
|
+
Best practice for bulk downloads:
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
from fetchxml import FetchXML
|
|
247
|
+
|
|
248
|
+
client = FetchXML(base_url="https://example.com")
|
|
249
|
+
|
|
250
|
+
urls = [
|
|
251
|
+
"https://example.com/file1.xml",
|
|
252
|
+
"https://example.com/file2.xml",
|
|
253
|
+
"https://example.com/file3.xml"
|
|
254
|
+
]
|
|
255
|
+
|
|
256
|
+
for url in urls:
|
|
257
|
+
xml = client.fetch(url)
|
|
258
|
+
print(f"Downloaded {url}")
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
This reuses the same session and cookies.
|
|
262
|
+
|
|
263
|
+
---
|
|
264
|
+
|
|
265
|
+
# 🧪 Testing Connectivity
|
|
266
|
+
|
|
267
|
+
You can quickly test a URL:
|
|
268
|
+
|
|
269
|
+
```python
|
|
270
|
+
from fetchxml import FetchXML
|
|
271
|
+
|
|
272
|
+
client = FetchXML()
|
|
273
|
+
|
|
274
|
+
try:
|
|
275
|
+
xml = client.fetch("https://example.com/sample.xml")
|
|
276
|
+
print("Success")
|
|
277
|
+
except Exception as e:
|
|
278
|
+
print("Error:", e)
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
---
|
|
282
|
+
|
|
283
|
+
# 🏗️ Project Structure
|
|
284
|
+
|
|
285
|
+
```
|
|
286
|
+
fetchxml/
|
|
287
|
+
│
|
|
288
|
+
├── fetchxml/
|
|
289
|
+
│   ├── __init__.py
|
|
290
|
+
│   ├── client.py
|
|
291
|
+
│   └── exceptions.py
|
|
292
|
+
│
|
|
293
|
+
├── pyproject.toml
|
|
294
|
+
├── README.md
|
|
295
|
+
└── LICENSE
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
---
|
|
299
|
+
|
|
300
|
+
# 📄 License
|
|
301
|
+
|
|
302
|
+
MIT License
|
|
303
|
+
|
|
304
|
+
See `LICENSE` file for full text.
|
|
305
|
+
|
|
306
|
+
---
|
|
307
|
+
|
|
308
|
+
# ⚠️ Disclaimer
|
|
309
|
+
|
|
310
|
+
`fetchxml` does not bypass authentication systems or CAPTCHAs.
|
|
311
|
+
|
|
312
|
+
It simply mimics normal browser session behavior using:
|
|
313
|
+
|
|
314
|
+
* Session cookies
|
|
315
|
+
* Proper headers
|
|
316
|
+
* Referer validation
|
|
317
|
+
|
|
318
|
+
Users are responsible for complying with website terms of service.
|
|
319
|
+
|
|
320
|
+
---
|
|
321
|
+
|
|
322
|
+
# 💡 When To Use fetchxml
|
|
323
|
+
|
|
324
|
+
Use it when:
|
|
325
|
+
|
|
326
|
+
* A site blocks naive `requests.get()`
|
|
327
|
+
* Cookies must be initialized first
|
|
328
|
+
* Referer headers are required
|
|
329
|
+
* You want clean, reusable XML fetching logic
|
|
330
|
+
|
|
331
|
+
Do NOT use it for:
|
|
332
|
+
|
|
333
|
+
* Circumventing login walls
|
|
334
|
+
* Bypassing paywalls
|
|
335
|
+
* Evading legal restrictions
|
|
336
|
+
|
|
337
|
+
---
|
|
338
|
+
|
|
339
|
+
# 🧩 Roadmap (Optional Future Enhancements)
|
|
340
|
+
|
|
341
|
+
* Async version
|
|
342
|
+
* Disk caching layer
|
|
343
|
+
* Proxy support
|
|
344
|
+
* Built-in XML validation
|
|
345
|
+
* Exponential backoff strategy
|
|
346
|
+
* Logging integration
|
|
347
|
+
|
|
348
|
+
---
|
|
349
|
+
|
|
350
|
+
# 👤 Author
|
|
351
|
+
|
|
352
|
+
Saurabh Kumar Agarwal
|
|
353
|
+
2026
|
|
354
|
+
|
|
355
|
+
---
|
|
356
|
+
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
fetchxml/__init__.py,sha256=-OcUbUvt0puNz55MC0AR03gaDGVqKdOhF0FGa4kn6vc,110
|
|
2
|
+
fetchxml/client.py,sha256=77ETD2j2g-dUZldYuxsL5ZdmRl0ERffrhXX7Le8BZ2o,2139
|
|
3
|
+
fetchxml/exceptions.py,sha256=Rn_c9AeDuHqh969Vj-qykSPLInQQr4qQvclzpi4Nvrk,90
|
|
4
|
+
fetchxml-0.1.0.dist-info/licenses/LICENSE,sha256=Fe4ROp3CXeMvKjcZjK1lLTMNHrLr1htCR8DGXESgxqU,1097
|
|
5
|
+
fetchxml-0.1.0.dist-info/METADATA,sha256=jmQ6cMDKpdy66WNV0UlVXV1NgBBXtNldqYBZHZe0t9k,6460
|
|
6
|
+
fetchxml-0.1.0.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
|
|
7
|
+
fetchxml-0.1.0.dist-info/top_level.txt,sha256=BdU2_dX2Y37md0_BDfE3fp92z94jmQmOu-hDBVEQ-xc,9
|
|
8
|
+
fetchxml-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Saurabh Kumar Agarwal
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
fetchxml
|