feedtrail 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- feedtrail-0.1.0/AUTHORS.md +10 -0
- feedtrail-0.1.0/HISTORY.md +7 -0
- feedtrail-0.1.0/MANIFEST.in +10 -0
- feedtrail-0.1.0/PKG-INFO +169 -0
- feedtrail-0.1.0/README.md +129 -0
- feedtrail-0.1.0/feedtrail/__init__.py +5 -0
- feedtrail-0.1.0/feedtrail/feed_parser.py +483 -0
- feedtrail-0.1.0/feedtrail/utils/__init__.py +1 -0
- feedtrail-0.1.0/feedtrail/utils/date_utils.py +154 -0
- feedtrail-0.1.0/feedtrail/utils/xml_utils.py +156 -0
- feedtrail-0.1.0/feedtrail.egg-info/PKG-INFO +169 -0
- feedtrail-0.1.0/feedtrail.egg-info/SOURCES.txt +22 -0
- feedtrail-0.1.0/feedtrail.egg-info/dependency_links.txt +1 -0
- feedtrail-0.1.0/feedtrail.egg-info/not-zip-safe +1 -0
- feedtrail-0.1.0/feedtrail.egg-info/requires.txt +1 -0
- feedtrail-0.1.0/feedtrail.egg-info/top_level.txt +1 -0
- feedtrail-0.1.0/requirements.txt +1 -0
- feedtrail-0.1.0/setup.cfg +26 -0
- feedtrail-0.1.0/setup.py +40 -0
- feedtrail-0.1.0/tests/__init__.py +1 -0
- feedtrail-0.1.0/tests/test_feed_parser.py +272 -0
- feedtrail-0.1.0/tests/test_utils_date_utils.py +80 -0
- feedtrail-0.1.0/tests/test_utils_xml_utils.py +55 -0
feedtrail-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: feedtrail
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Feed Tracking and Retrieval Abstraction Interface Layer
|
|
5
|
+
Home-page: https://github.com/juanmcristobal/feedtrail
|
|
6
|
+
Author: Juan Manuel Cristóbal Moreno
|
|
7
|
+
Author-email: juanmcristobal@gmail.com
|
|
8
|
+
Keywords: feedtrail
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Natural Language :: English
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: AUTHORS.md
|
|
19
|
+
Requires-Dist: python-dateutil>=2.8.2
|
|
20
|
+
Dynamic: author
|
|
21
|
+
Dynamic: author-email
|
|
22
|
+
Dynamic: classifier
|
|
23
|
+
Dynamic: description
|
|
24
|
+
Dynamic: description-content-type
|
|
25
|
+
Dynamic: home-page
|
|
26
|
+
Dynamic: keywords
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
Dynamic: requires-dist
|
|
29
|
+
Dynamic: requires-python
|
|
30
|
+
Dynamic: summary
|
|
31
|
+
|
|
32
|
+
# feedtrail
|
|
33
|
+
|
|
34
|
+
Feed Tracking and Retrieval Abstraction Interface Layer.
|
|
35
|
+
|
|
36
|
+
`feedtrail` provides a resilient RSS/Atom parser focused on production-style feeds where XML can be noisy, partially malformed, or inconsistent across publishers.
|
|
37
|
+
|
|
38
|
+
## What It Does
|
|
39
|
+
|
|
40
|
+
- Parses RSS and Atom feeds with namespace support.
|
|
41
|
+
- Normalizes and cleans feed/item text content.
|
|
42
|
+
- Converts heterogeneous date formats to UTC ISO strings.
|
|
43
|
+
- Extracts structured metadata: title, link, description, summary, author, categories, and primary image.
|
|
44
|
+
- Computes a deterministic `request_hash` for parsed payload integrity checks.
|
|
45
|
+
- Handles malformed XML defensively (entity sanitization, escaped CDATA recovery, trailing content trimming).
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
### Runtime
|
|
50
|
+
|
|
51
|
+
Requirements:
|
|
52
|
+
- Python `>= 3.10`
|
|
53
|
+
|
|
54
|
+
Install from source:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install .
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Development
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pip install -r requirements_dev.txt
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Quick Start
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from feedtrail.feed_parser import FeedParser
|
|
70
|
+
|
|
71
|
+
xml_content = """<?xml version="1.0" encoding="UTF-8"?>
|
|
72
|
+
<rss version="2.0">
|
|
73
|
+
<channel>
|
|
74
|
+
<title>Example Feed</title>
|
|
75
|
+
<link>https://example.com</link>
|
|
76
|
+
<description>Demo</description>
|
|
77
|
+
<item>
|
|
78
|
+
<title>Hello</title>
|
|
79
|
+
<link>/hello</link>
|
|
80
|
+
<pubDate>Wed, 20 Mar 2024 09:00:00 GMT</pubDate>
|
|
81
|
+
<description><![CDATA[Post body]]></description>
|
|
82
|
+
</item>
|
|
83
|
+
</channel>
|
|
84
|
+
</rss>"""
|
|
85
|
+
|
|
86
|
+
parser = FeedParser()
|
|
87
|
+
result = parser.parse(xml_content, base_url="https://example.com")
|
|
88
|
+
|
|
89
|
+
print(result["headers"]["title"])
|
|
90
|
+
print(result["items"][0]["link"])
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Output Contract
|
|
94
|
+
|
|
95
|
+
`FeedParser.parse(...)` returns a dictionary with:
|
|
96
|
+
|
|
97
|
+
- `headers`: feed-level metadata (`title`, `link`, `description`, `updated`, `language`, `generator`, `parent_link`, `self_link`).
|
|
98
|
+
- `items`: list of normalized entries, each including:
|
|
99
|
+
- `title`
|
|
100
|
+
- `link`
|
|
101
|
+
- `description`
|
|
102
|
+
- `summary`
|
|
103
|
+
- `pub_date` (ISO-like UTC string, when available)
|
|
104
|
+
- `author`
|
|
105
|
+
- `image`
|
|
106
|
+
- `categories`
|
|
107
|
+
- `request_hash`: SHA-256 hash of normalized parsed payload.
|
|
108
|
+
- `error`: present when parsing fails (`items` will be empty in that case).
|
|
109
|
+
|
|
110
|
+
## Development Workflow
|
|
111
|
+
|
|
112
|
+
### Run tests
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
make test
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Lint and formatting
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
make lint
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Coverage
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
make coverage
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Build Docker test image
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
make build-test-image
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### Run tox environments in Docker
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
make test-all
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Project Structure
|
|
143
|
+
|
|
144
|
+
```text
|
|
145
|
+
feedtrail/
|
|
146
|
+
feed_parser.py # Core RSS/Atom parser
|
|
147
|
+
utils/
|
|
148
|
+
date_utils.py # Date parsing and normalization
|
|
149
|
+
xml_utils.py # XML sanitization and extraction helpers
|
|
150
|
+
tests/
|
|
151
|
+
test_feed_parser.py
|
|
152
|
+
test_utils_date_utils.py
|
|
153
|
+
test_utils_xml_utils.py
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## Author
|
|
157
|
+
|
|
158
|
+
- Juan Manuel Cristóbal Moreno (<juanmcristobal@gmail.com>)
|
|
159
|
+
|
|
160
|
+
See [AUTHORS.md](AUTHORS.md) for contributors.
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# History
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
## 0.1.0 (2026-03-25)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
* First release.
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# feedtrail
|
|
2
|
+
|
|
3
|
+
Feed Tracking and Retrieval Abstraction Interface Layer.
|
|
4
|
+
|
|
5
|
+
`feedtrail` provides a resilient RSS/Atom parser focused on production-style feeds where XML can be noisy, partially malformed, or inconsistent across publishers.
|
|
6
|
+
|
|
7
|
+
## What It Does
|
|
8
|
+
|
|
9
|
+
- Parses RSS and Atom feeds with namespace support.
|
|
10
|
+
- Normalizes and cleans feed/item text content.
|
|
11
|
+
- Converts heterogeneous date formats to UTC ISO strings.
|
|
12
|
+
- Extracts structured metadata: title, link, description, summary, author, categories, and primary image.
|
|
13
|
+
- Computes a deterministic `request_hash` for parsed payload integrity checks.
|
|
14
|
+
- Handles malformed XML defensively (entity sanitization, escaped CDATA recovery, trailing content trimming).
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
### Runtime
|
|
19
|
+
|
|
20
|
+
Requirements:
|
|
21
|
+
- Python `>= 3.10`
|
|
22
|
+
|
|
23
|
+
Install from source:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install .
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### Development
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install -r requirements_dev.txt
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Quick Start
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from feedtrail.feed_parser import FeedParser
|
|
39
|
+
|
|
40
|
+
xml_content = """<?xml version="1.0" encoding="UTF-8"?>
|
|
41
|
+
<rss version="2.0">
|
|
42
|
+
<channel>
|
|
43
|
+
<title>Example Feed</title>
|
|
44
|
+
<link>https://example.com</link>
|
|
45
|
+
<description>Demo</description>
|
|
46
|
+
<item>
|
|
47
|
+
<title>Hello</title>
|
|
48
|
+
<link>/hello</link>
|
|
49
|
+
<pubDate>Wed, 20 Mar 2024 09:00:00 GMT</pubDate>
|
|
50
|
+
<description><![CDATA[Post body]]></description>
|
|
51
|
+
</item>
|
|
52
|
+
</channel>
|
|
53
|
+
</rss>"""
|
|
54
|
+
|
|
55
|
+
parser = FeedParser()
|
|
56
|
+
result = parser.parse(xml_content, base_url="https://example.com")
|
|
57
|
+
|
|
58
|
+
print(result["headers"]["title"])
|
|
59
|
+
print(result["items"][0]["link"])
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Output Contract
|
|
63
|
+
|
|
64
|
+
`FeedParser.parse(...)` returns a dictionary with:
|
|
65
|
+
|
|
66
|
+
- `headers`: feed-level metadata (`title`, `link`, `description`, `updated`, `language`, `generator`, `parent_link`, `self_link`).
|
|
67
|
+
- `items`: list of normalized entries, each including:
|
|
68
|
+
- `title`
|
|
69
|
+
- `link`
|
|
70
|
+
- `description`
|
|
71
|
+
- `summary`
|
|
72
|
+
- `pub_date` (ISO-like UTC string, when available)
|
|
73
|
+
- `author`
|
|
74
|
+
- `image`
|
|
75
|
+
- `categories`
|
|
76
|
+
- `request_hash`: SHA-256 hash of normalized parsed payload.
|
|
77
|
+
- `error`: present when parsing fails (`items` will be empty in that case).
|
|
78
|
+
|
|
79
|
+
## Development Workflow
|
|
80
|
+
|
|
81
|
+
### Run tests
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
make test
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Lint and formatting
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
make lint
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Coverage
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
make coverage
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Build Docker test image
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
make build-test-image
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Run tox environments in Docker
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
make test-all
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Project Structure
|
|
112
|
+
|
|
113
|
+
```text
|
|
114
|
+
feedtrail/
|
|
115
|
+
feed_parser.py # Core RSS/Atom parser
|
|
116
|
+
utils/
|
|
117
|
+
date_utils.py # Date parsing and normalization
|
|
118
|
+
xml_utils.py # XML sanitization and extraction helpers
|
|
119
|
+
tests/
|
|
120
|
+
test_feed_parser.py
|
|
121
|
+
test_utils_date_utils.py
|
|
122
|
+
test_utils_xml_utils.py
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Author
|
|
126
|
+
|
|
127
|
+
- Juan Manuel Cristóbal Moreno (<juanmcristobal@gmail.com>)
|
|
128
|
+
|
|
129
|
+
See [AUTHORS.md](AUTHORS.md) for contributors.
|