fastfeedparser-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fastfeedparser-0.1.0/LICENSE +21 -0
- fastfeedparser-0.1.0/PKG-INFO +150 -0
- fastfeedparser-0.1.0/README.md +122 -0
- fastfeedparser-0.1.0/pyproject.toml +3 -0
- fastfeedparser-0.1.0/setup.cfg +43 -0
- fastfeedparser-0.1.0/src/fastfeedparser/__init__.py +4 -0
- fastfeedparser-0.1.0/src/fastfeedparser/main.py +676 -0
- fastfeedparser-0.1.0/src/fastfeedparser.egg-info/PKG-INFO +150 -0
- fastfeedparser-0.1.0/src/fastfeedparser.egg-info/SOURCES.txt +11 -0
- fastfeedparser-0.1.0/src/fastfeedparser.egg-info/dependency_links.txt +1 -0
- fastfeedparser-0.1.0/src/fastfeedparser.egg-info/requires.txt +4 -0
- fastfeedparser-0.1.0/src/fastfeedparser.egg-info/top_level.txt +1 -0

fastfeedparser-0.1.0/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Kagi Search
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

fastfeedparser-0.1.0/PKG-INFO
@@ -0,0 +1,150 @@
+Metadata-Version: 2.1
+Name: fastfeedparser
+Version: 0.1.0
+Summary: High performance RSS, Atom and RDF parser in Python
+Home-page: https://github.com/kagi-search/fastfeedparser
+Author: Vladimir Prelovac
+Author-email: vlad@kagi.com
+Project-URL: Bug Tracker, https://github.com/kagi-search/fastfeedparser/issues
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing :: Markup :: XML
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: httpx
+Requires-Dist: lxml
+Requires-Dist: parsedatetime
+Requires-Dist: python-dateutil
+
[lines 29-150: the package long description, identical to README.md, shown in full below]

fastfeedparser-0.1.0/README.md
@@ -0,0 +1,122 @@
+# FastFeedParser
+
+A high-performance RSS, Atom, and RDF feed parser for Python. FastFeedParser is designed to be fast, memory-efficient, and easy to use while providing comprehensive feed parsing capabilities.
+
+## Why FastFeedParser?
+
+The main advantage of FastFeedParser over the traditional feedparser library is its lightweight design and exceptional performance: benchmarks show it is 10x-100x faster than feedparser while maintaining a familiar API. This speed improvement is achieved through:
+
+- Efficient XML parsing using lxml
+- Optimized memory usage
+- Minimal dependencies
+- A streamlined codebase focused on core functionality
+
+## Features
+
+- Fast parsing of RSS 2.0, Atom 1.0, and RDF/RSS 1.0 feeds
+- Robust error handling and encoding detection
+- Support for media content and enclosures
+- Automatic date parsing with timezone handling
+- Clean, Pythonic API similar to feedparser
+- Comprehensive handling of feed metadata
+- Support for various feed extensions (Media RSS, Dublin Core, etc.)
+
+## Installation
+
+```bash
+pip install fastfeedparser
+```
+
+## Quick Start
+
+```python
+import fastfeedparser
+
+# Parse from URL
+feed = fastfeedparser.parse_url('https://example.com/feed.xml')
+
+# Parse from string
+xml_content = '''<?xml version="1.0"?>
+<rss version="2.0">
+    <channel>
+        <title>Example Feed</title>
+        ...
+    </channel>
+</rss>'''
+feed = fastfeedparser.parse(xml_content)
+
+# Access feed information
+print(feed.feed.title)
+print(feed.feed.link)
+
+# Access entries
+for entry in feed.entries:
+    print(entry.title)
+    print(entry.link)
+    print(entry.published)
+```
+
+## Key Features
+
+### Supported Feed Types
+
+- RSS 2.0
+- Atom 1.0
+- RDF/RSS 1.0
+
+### Content Handling
+
+- Automatic encoding detection
+- HTML content parsing
+- Media content extraction
+- Enclosure handling
+
+### Metadata Support
+
+- Feed title, link, and description
+- Publication dates
+- Author information
+- Categories and tags
+- Media content and thumbnails
+
+## API Reference
+
+### Main Functions
+
+- `parse(xml_content)`: Parse a feed from a string or bytes
+- `parse_url(url)`: Parse a feed from a URL
+- `fetch_url(url)`: Fetch content from a URL
+
+### Feed Object Structure
+
+The parser returns a `FastFeedParserDict` object with two main sections:
+
+- `feed`: Contains feed-level metadata
+- `entries`: List of feed entries
+
+Each entry contains:
+
+- `title`: Entry title
+- `link`: Entry URL
+- `description`: Entry description/summary
+- `published`: Publication date
+- `author`: Author information
+- `content`: Full content
+- `media_content`: Media attachments
+- `enclosures`: Attached files
+
+## Requirements
+
+- Python 3.7+
+- httpx
+- lxml
+- parsedatetime
+- python-dateutil
+
+## Contributing
+
+Contributions are welcome! Please feel free to submit a Pull Request.
+
+## License
+
+This project is licensed under the MIT License; see the LICENSE file for details.
+
+## Acknowledgments
+
+Inspired by the Universal Feed Parser (feedparser) project, FastFeedParser aims to provide a modern, high-performance alternative while maintaining a familiar API.
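
The "robust error handling" bullet in the README above is concrete in `src/fastfeedparser/main.py` later in this diff: `parse()` raises `ValueError` for empty input, bytes it cannot decode, root elements it does not recognize, and feeds that contain no entries. A minimal sketch of what callers can rely on (the inputs here are illustrative):

```python
import fastfeedparser

for bad in ["", "<html/>"]:
    try:
        fastfeedparser.parse(bad)
    except ValueError as exc:
        # "" -> "Empty content"; "<html/>" -> "Unknown feed type: html"
        print(f"rejected {bad!r}: {exc}")
```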

fastfeedparser-0.1.0/setup.cfg
@@ -0,0 +1,43 @@
+[metadata]
+name = fastfeedparser
+version = 0.1.0
+author = Vladimir Prelovac
+author_email = vlad@kagi.com
+description = High performance RSS, Atom and RDF parser in Python
+long_description = file: README.md
+long_description_content_type = text/markdown
+url = https://github.com/kagi-search/fastfeedparser
+project_urls =
+    Bug Tracker = https://github.com/kagi-search/fastfeedparser/issues
+classifiers =
+    Programming Language :: Python :: 3
+    Programming Language :: Python :: 3.7
+    Programming Language :: Python :: 3.8
+    Programming Language :: Python :: 3.9
+    Programming Language :: Python :: 3.10
+    Programming Language :: Python :: 3.11
+    License :: OSI Approved :: MIT License
+    Operating System :: OS Independent
+    Development Status :: 4 - Beta
+    Intended Audience :: Developers
+    Topic :: Software Development :: Libraries :: Python Modules
+    Topic :: Text Processing :: Markup :: XML
+
+[options]
+package_dir =
+    = src
+packages = find:
+python_requires = >=3.7
+install_requires =
+    httpx
+    lxml
+    parsedatetime
+    python-dateutil
+
+[options.packages.find]
+where = src
+
+[egg_info]
+tag_build =
+tag_date = 0
+

fastfeedparser-0.1.0/src/fastfeedparser/main.py
@@ -0,0 +1,676 @@
+import datetime
+import httpx
+from lxml import etree
+import parsedatetime
+from dateutil import parser as dateutil_parser
+
+MEDIA_NS = "http://search.yahoo.com/mrss/"
+
+
+class FastFeedParserDict(dict):
+    """A dictionary that allows access to its keys as attributes."""
+
+    def __getattr__(self, name):
+        try:
+            return self[name]
+        except KeyError:
+            raise AttributeError(
+                f"'FastFeedParserDict' object has no attribute '{name}'"
+            )
+
+    def __setattr__(self, name, value):
+        self[name] = value
+
+
+def parse(xml_content):
+    """Parse the XML content of a feed."""
+    if not xml_content.strip():
+        raise ValueError("Empty content")
+
+    # Handle decoding if content is bytes
+    if isinstance(xml_content, bytes):
+        encodings = ["utf-8", "iso-8859-1", "windows-1252"]
+        decoded = None
+        for encoding in encodings:
+            try:
+                decoded = xml_content.decode(encoding)
+                break
+            except UnicodeDecodeError:
+                continue
+        if decoded is None:
+            raise ValueError("Could not decode content with any supported encoding")
+        xml_content = decoded
+
+    # Ensure we have bytes for lxml
+    xml_content = xml_content.encode("utf-8", errors="replace")
+
+    parser = etree.XMLParser(recover=True)
+    try:
+        root = etree.fromstring(xml_content, parser=parser)
+    except etree.XMLSyntaxError as e:
+        raise ValueError(f"Failed to parse XML content: {str(e)}")
+
+    # Check if root is None
+    if root is None:
+        raise ValueError("Failed to parse XML content: root element is None")
+
+    namespaces = root.nsmap
+
+    # Clean up the XML tree
+    for element in root.iter():
+        if element.text:
+            element.text = element.text.replace("\x00", "")  # Remove null characters
+        if element.tail:
+            element.tail = element.tail.replace("\x00", "")  # Remove null characters
+
+    feed = FastFeedParserDict()
+    entries = []
+
+    # Determine feed type based on content structure
+    if root.tag == "rss" or root.tag == f"{{{namespaces.get(None, '')}}}rss":
+        feed_type = "rss"
+        channel = root.find("channel")
+        if channel is None:
+            raise ValueError("Invalid RSS feed: missing channel element")
+        items = channel.findall("item")
+    elif (
+        root.tag.endswith("feed")
+        or root.tag
+        == f"{{{namespaces.get('atom', 'http://www.w3.org/2005/Atom')}}}feed"
+    ):
+        feed_type = "atom"
+        channel = root
+        items = root.findall(
+            f".//{{{namespaces.get('atom', 'http://www.w3.org/2005/Atom')}}}entry"
+        ) or root.findall("entry")
+    elif (
+        root.tag.endswith("RDF")
+        or root.tag
+        == f"{{{namespaces.get('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#')}}}RDF"
+    ):
+        feed_type = "rdf"
+        channel = root
+        items = root.findall(
+            f".//{{{namespaces.get('rss', 'http://purl.org/rss/1.0/')}}}item"
+        ) or root.findall("item")
+    else:
+        raise ValueError(f"Unknown feed type: {root.tag}")
+
+    if not items:
+        raise ValueError("No entries found in the feed")
+
+    def parse_feed_info(channel, feed_type, namespaces):
+        feed = FastFeedParserDict()
+
+        def get_feed_value(rss_field, atom_field, rdf_field=None, is_attr=False):
+            if feed_type == "rss":
+                value = get_element_value(channel, rss_field, namespaces) or (
+                    (
+                        get_element_value(
+                            channel, atom_field, namespaces, attribute="href"
+                        )
+                        or get_element_value(
+                            channel, atom_field, namespaces, attribute="link"
+                        )
+                    )
+                    if is_attr
+                    else get_element_value(channel, atom_field, namespaces)
+                )
+            elif feed_type == "atom":
+                value = get_element_value(channel, atom_field, namespaces) or (
+                    (
+                        get_element_value(
+                            channel, atom_field, namespaces, attribute="href"
+                        )
+                        or get_element_value(
+                            channel, atom_field, namespaces, attribute="link"
+                        )
+                    )
+                    if is_attr
+                    else ""
+                )
+            else:  # RDF
+                value = (
+                    get_element_value(channel, rdf_field, namespaces)
+                    if rdf_field
+                    else ""
+                )
+            return value if value else None
+
+        fields = [
+            (
+                "title",
+                "title",
+                "{http://www.w3.org/2005/Atom}title",
+                "{http://purl.org/rss/1.0/}channel/{http://purl.org/rss/1.0/}title",
+            ),
+            (
+                "link",
+                "link",
+                "{http://www.w3.org/2005/Atom}link",
+                "{http://purl.org/rss/1.0/}channel/{http://purl.org/rss/1.0/}link",
+                True,
+            ),
+            (
+                "subtitle",
+                "description",
+                "{http://www.w3.org/2005/Atom}subtitle",
+                "{http://purl.org/rss/1.0/}channel/{http://purl.org/rss/1.0/}description",
+            ),
+            (
+                "generator",
+                "generator",
+                "{http://www.w3.org/2005/Atom}generator",
+                "{http://purl.org/rss/1.0/}channel/{http://webns.net/mvcb/}generatorAgent",
+            ),
+            (
+                "publisher",
+                "publisher",
+                "{http://www.w3.org/2005/Atom}publisher",
+                "{http://purl.org/rss/1.0/}channel/{http://purl.org/dc/elements/1.1/}publisher",
+            ),
+            (
+                "author",
+                "author",
+                "{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name",
+                "{http://purl.org/rss/1.0/}channel/{http://purl.org/dc/elements/1.1/}creator",
+            ),
+            (
+                "updated",
+                "lastBuildDate",
+                "{http://www.w3.org/2005/Atom}updated",
+                "{http://purl.org/rss/1.0/}channel/{http://purl.org/dc/elements/1.1/}date",
+            ),
+        ]
+
+        for field in fields:
+            value = get_feed_value(*field[1:])
+            if value:
+                feed[field[0]] = value
+
+        # Add title_detail and subtitle_detail
+        if "title" in feed:
+            feed["title_detail"] = {
+                "type": "text/plain",
+                "language": channel.get("{http://www.w3.org/XML/1998/namespace}lang"),
+                "base": channel.get("{http://www.w3.org/XML/1998/namespace}base"),
+                "value": feed["title"],
+            }
+        if "subtitle" in feed:
+            feed["subtitle_detail"] = {
+                "type": "text/plain",
+                "language": channel.get("{http://www.w3.org/XML/1998/namespace}lang"),
+                "base": channel.get("{http://www.w3.org/XML/1998/namespace}base"),
+                "value": feed["subtitle"],
+            }
+
+        # Add links
+        feed["links"] = []
+        feed_link = None
+        for link in channel.findall("{http://www.w3.org/2005/Atom}link"):
+            rel = link.get("rel")
+            href = link.get("href")
+            if rel is None and href:
+                feed_link = href
+            elif rel not in ["hub", "self", "replies", "edit"]:
+                feed["links"].append(
+                    {
+                        "rel": rel,
+                        "type": link.get("type"),
+                        "href": href,
+                        "title": link.get("title"),
+                    }
+                )
+
+        if feed_link:
+            feed["link"] = feed_link
+            feed["links"].insert(
+                0, {"rel": "alternate", "type": "text/html", "href": feed_link}
+            )
+
+        # Add id
+        feed["id"] = get_element_value(
+            channel, "{http://www.w3.org/2005/Atom}id", namespaces
+        )
+
+        # Add generator_detail
+        generator = channel.find("{http://www.w3.org/2005/Atom}generator")
+        if generator is not None:
+            feed["generator_detail"] = {
+                "name": generator.text,
+                "version": generator.get("version"),
+                "href": generator.get("uri"),
+            }
+
+        feed["language"] = channel.get("{http://www.w3.org/XML/1998/namespace}lang")
+        feed["guidislink"] = False
+
+        if feed_type == "rss":
+            comments = get_element_value(channel, "comments", namespaces)
+            if comments:
+                feed["comments"] = comments
+
+            # Additional checks for publisher and author
+            if "publisher" not in feed:
+                webmaster = get_element_value(channel, "webMaster", namespaces)
+                if webmaster:
+                    feed["publisher"] = webmaster
+            if "author" not in feed:
+                managing_editor = get_element_value(channel, "managingEditor", namespaces)
+                if managing_editor:
+                    feed["author"] = managing_editor
+
+        return {"feed": feed}
+
+    feed.update(parse_feed_info(channel, feed_type, namespaces))
+
+    # Parse entries
+    def parse_feed_entry(item, feed_type, namespaces):
+        entry = FastFeedParserDict()
+
+        def get_entry_value(rss_field, atom_field, rdf_field=None, is_attr=False):
+            if feed_type == "rss":
+                value = get_element_value(item, rss_field, namespaces) or (
+                    (
+                        get_element_value(
+                            item, atom_field, namespaces, attribute="href"
+                        )
+                        or get_element_value(
+                            item, atom_field, namespaces, attribute="link"
+                        )
+                    )
+                    if is_attr
+                    else get_element_value(item, atom_field, namespaces)
+                )
+            elif feed_type == "atom":
+                value = get_element_value(item, atom_field, namespaces) or (
+                    (
+                        get_element_value(
+                            item, atom_field, namespaces, attribute="href"
+                        )
+                        or get_element_value(
+                            item, atom_field, namespaces, attribute="link"
+                        )
+                    )
+                    if is_attr
+                    else ""
+                )
+            else:  # RDF
+                value = (
+                    get_element_value(item, rdf_field, namespaces) if rdf_field else ""
+                )
+            return value if value else None
+
+        fields = [
+            (
+                "title",
+                "title",
+                "{http://www.w3.org/2005/Atom}title",
+                "{http://purl.org/rss/1.0/}title",
+            ),
+            (
+                "link",
+                "link",
+                "{http://www.w3.org/2005/Atom}link",
+                "{http://purl.org/rss/1.0/}link",
+                True,
+            ),
+            (
+                "description",
+                "description",
+                "{http://www.w3.org/2005/Atom}summary",
+                "{http://purl.org/rss/1.0/}description",
+            ),
+            (
+                "published",
+                "pubDate",
+                "{http://www.w3.org/2005/Atom}published",
+                "{http://purl.org/dc/elements/1.1/}date",
+            ),
+            (
+                "updated",
+                "lastBuildDate",
+                "{http://www.w3.org/2005/Atom}updated",
+                "{http://purl.org/dc/terms/}modified",
+            ),
+        ]
+
+        for field in fields:
+            value = get_entry_value(*field[1:])
+            if value:
+                if field[0] in ["published", "updated"]:
+                    value = parse_date(value)
+                entry[field[0]] = value
+
+        # If published is missing but updated exists, use updated as published
+        if "updated" in entry and "published" not in entry:
+            entry["published"] = entry["updated"]
+
+        # Handle links
+        entry["links"] = []
+        alternate_link = None
+        for link in item.findall("{http://www.w3.org/2005/Atom}link"):
+            rel = link.get("rel")
+            href = link.get("href") or link.get(
+                "link"
+            )  # Check both 'href' and 'link' attributes
+            if href:
+                if rel == "alternate":
+                    alternate_link = {
+                        "rel": rel,
+                        "type": link.get("type"),
+                        "href": href,
+                        "title": link.get("title"),
+                    }
+                elif rel not in ["edit", "self"]:
+                    entry["links"].append(
+                        {
+                            "rel": rel,
+                            "type": link.get("type"),
+                            "href": href,
+                            "title": link.get("title"),
+                        }
+                    )
+
+        # Check for guid that looks like a URL
+        guid = item.find("guid")
+        guid_text = guid.text.strip() if guid is not None and guid.text else None
+        is_guid_url = guid_text and (
+            guid_text.startswith("http://") or guid_text.startswith("https://")
+        )
+
+        if is_guid_url:
+            # Prefer guid as link when it looks like a URL
+            entry["link"] = guid_text
+            if alternate_link:
+                entry["links"].insert(
+                    0, {"rel": "alternate", "type": "text/html", "href": guid_text}
+                )
+        elif alternate_link:
+            entry["links"].insert(0, alternate_link)
+            entry["link"] = alternate_link["href"]
+        elif (
+            "link" not in entry
+            and guid is not None
+            and guid.get("isPermaLink") == "true"
+        ):
+            entry["link"] = guid.text
+
+        content = None
+        if feed_type == "rss":
+            content = item.find("{http://purl.org/rss/1.0/modules/content/}encoded")
+            if content is None:
+                for ns, uri in namespaces.items():
+                    if uri == "http://purl.org/rss/1.0/modules/content/":
+                        content = item.find(f"{{{uri}}}encoded")
+                        break
+            if content is None:
+                content = item.find("content")
+        elif feed_type == "atom":
+            content = item.find("{http://www.w3.org/2005/Atom}content")
+
+        if content is not None:
+            content_type = content.get("type", "text/html")  # Default to text/html
+            if content_type in ["html", "xhtml"]:
+                # For XHTML content, serialize the entire content
+                content_value = etree.tostring(
+                    content, encoding="unicode", method="xml"
+                )
+            else:
+                content_value = content.text if content.text else ""
+
+            entry["content"] = [
+                {
+                    "type": content_type,
+                    "language": content.get(
+                        "{http://www.w3.org/XML/1998/namespace}lang"
+                    ),
+                    "base": content.get("{http://www.w3.org/XML/1998/namespace}base"),
+                    "value": content_value,
+                }
+            ]
+
+        # If content is still empty, try to use description
+        if "content" not in entry or not entry["content"]:
+            description = item.find("description")
+            if description is not None and description.text:
+                entry["content"] = [{"type": "text/html", "value": description.text}]
+
+        if ("description" not in entry) and ("content" in entry or "summary" in entry):
+            content = (
+                entry.get("content", [{}])[0].get("value", "")
+                if entry.get("content")
+                else ""
+            )
+            if content:
+                try:
+                    html_content = etree.HTML(content)
+                    if html_content is not None:
+                        content_text = html_content.xpath("string()")
+                        entry["description"] = " ".join(content_text.split()[:256])
+                    else:
+                        entry["description"] = content[:512]
+                except etree.ParserError:
+                    entry["description"] = content[:512]
+            else:
+                entry["description"] = entry.get("summary", "")[:512]
+
+        # Handle media content
+        media_contents = []
+
+        # Process media:content elements
+        for media in item.findall(f".//{{{MEDIA_NS}}}content"):
+            media_item = {
+                "url": media.get("url"),
+                "type": media.get("type"),
+                "medium": media.get("medium"),
+                "width": media.get("width"),
+                "height": media.get("height"),
+            }
+
+            # Convert width/height to integers if present
+            for dim in ["width", "height"]:
+                if media_item.get(dim):
+                    try:
+                        media_item[dim] = int(media_item[dim])
+                    except (ValueError, TypeError):
+                        del media_item[dim]
+
+            # Handle sibling elements
+            # Handle title
+            title = media.find(f"{{{MEDIA_NS}}}title")
+            if title is not None and title.text:
+                media_item["title"] = title.text.strip()
+
+            # Handle credit
+            credit = media.find(f"{{{MEDIA_NS}}}credit")
+            if credit is not None:
+                media_item["credit"] = credit.text.strip() if credit.text else None
+                media_item["credit_scheme"] = credit.get("scheme")
+
+            # Handle text
+            text = media.find(f"{{{MEDIA_NS}}}text")
+            if text is not None and text.text:
+                media_item["text"] = text.text.strip()
+
+            # Handle description - check both direct child and sibling elements
+            desc = media.find(f"{{{MEDIA_NS}}}description")
+            if desc is None:
+                desc = media.getparent().find(f"{{{MEDIA_NS}}}description")
+            if desc is not None and desc.text:
+                media_item["description"] = desc.text.strip()
+
+            # Handle credit - check both direct child and sibling elements
+            credit = media.find(f"{{{MEDIA_NS}}}credit")
+            if credit is None:
+                credit = media.getparent().find(f"{{{MEDIA_NS}}}credit")
+            if credit is not None and credit.text:
+                media_item["credit"] = credit.text.strip()
+
+            # Handle thumbnail as a separate URL field
+            thumbnail = media.find(f"{{{MEDIA_NS}}}thumbnail")
+            if thumbnail is not None:
+                media_item["thumbnail_url"] = thumbnail.get("url")
+
+            # Remove None values
+            media_item = {k: v for k, v in media_item.items() if v is not None}
+
+            if media_item:  # Only append if we have some content
+                media_contents.append(media_item)
+
+        # If no media:content but there are standalone thumbnails, add them
+        if not media_contents:
+            for thumbnail in item.findall(f".//{{{MEDIA_NS}}}thumbnail"):
+                if thumbnail.getparent().tag != f"{{{MEDIA_NS}}}content":
+                    thumb_item = {
+                        "url": thumbnail.get("url"),
+                        "type": "image/jpeg",  # Default type for thumbnails
+                        "width": thumbnail.get("width"),
+                        "height": thumbnail.get("height"),
+                    }
+                    # Convert dimensions to integers if present
+                    for dim in ["width", "height"]:
+                        if thumb_item.get(dim):
+                            try:
+                                thumb_item[dim] = int(thumb_item[dim])
+                            except (ValueError, TypeError):
+                                del thumb_item[dim]
+
+                    # Remove None values
+                    thumb_item = {k: v for k, v in thumb_item.items() if v is not None}
+
+                    if thumb_item:
+                        media_contents.append(thumb_item)
+
+        if media_contents:
+            entry["media_content"] = media_contents
+
+        # Handle enclosures
+        enclosures = []
+        for enclosure in item.findall("enclosure"):
+            enc_item = {
+                "url": enclosure.get("url"),
+                "type": enclosure.get("type"),
+                "length": enclosure.get("length"),
+            }
+            # Convert length to integer if present and valid
+            if enc_item["length"]:
+                try:
+                    enc_item["length"] = int(enc_item["length"])
+                except (ValueError, TypeError):
+                    del enc_item["length"]
+
+            # Remove None values
+            enc_item = {k: v for k, v in enc_item.items() if v is not None}
+
+            if enc_item.get("url"):  # Only append if we have a URL
+                enclosures.append(enc_item)
+
+        if enclosures:
+            entry["enclosures"] = enclosures
+
+        author = (
+            get_entry_value(
+                "author",
+                "{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name",
+                "{http://purl.org/dc/elements/1.1/}creator",
+            )
+            or get_entry_value(
+                "dc:creator",
+                "{http://purl.org/dc/elements/1.1/}creator",
+                "{http://purl.org/dc/elements/1.1/}creator",
+            )
+            or get_element_value(
+                item, "{http://purl.org/dc/elements/1.1/}creator", namespaces
+            )
+            or get_element_value(item, "author", namespaces)
+        )
+        if author:
+            entry["author"] = author
+
+        if feed_type == "rss":
+            comments = get_element_value(item, "comments", namespaces)
+            if comments:
+                entry["comments"] = comments
+
+        return entry
+
+    # Usage:
+    for item in items:
+        entry = parse_feed_entry(item, feed_type, namespaces)
+        entries.append(entry)
+
+    # Trim titles and descriptions
+    for entry in entries:
+        entry["title"] = trim_text(entry.get("title", ""))
+        entry["description"] = trim_text(entry.get("description", ""))
+        entry["summary"] = trim_text(entry.get("summary", ""))
+
+    feed["entries"] = entries
+    return feed
+
+
+def trim_text(text):
+    """Trim leading and trailing whitespace from text."""
+    return text.strip() if text else ""
+
+
+def get_element_value(element, tag, namespaces, attribute=None):
+    """Get text content or attribute value of an element."""
+    if ":" in tag and not tag.startswith("{"):
+        prefix, tag_name = tag.split(":")
+        uri = namespaces.get(prefix, "")
+        tag = f"{{{uri}}}{tag_name}"
+    el = element.find(tag)
+    if el is not None:
+        if attribute:
+            return el.get(attribute)
+        else:
+            return el.text
+    return None
+
+
+# Initialize parsedatetime Calendar
+cal = parsedatetime.Calendar()
+
+
+def parse_date(date_str):
+    """Parse date string and return as a standard string in UTC."""
+    if not date_str:
+        return None
+
+    # Try dateutil.parser first
+    try:
+        dt = dateutil_parser.parse(date_str)
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=datetime.timezone.utc)
+        else:
+            dt = dt.astimezone(datetime.timezone.utc)
+        return dt.strftime("%Y-%m-%d %H:%M:%S %Z")
+    except (ValueError, OverflowError):
+        pass
+
+    # Fall back to parsedatetime
+    try:
+        time_struct, parse_status = cal.parse(date_str)
+        if parse_status:
+            dt = datetime.datetime(*time_struct[:6], tzinfo=datetime.timezone.utc)
+            return dt.strftime("%Y-%m-%d %H:%M:%S %Z")
+    except ValueError:
+        pass
+
+    # If all parsing attempts fail, return the original string
+    return date_str
+
+
+def parse_url(url):
+    """Parse a URL and return a FastFeedParserDict object."""
+    # Fetch the document first; parse() expects XML content, not a URL
+    return parse(fetch_url(url))
+
+
+def fetch_url(url):
+    """Fetch content from a URL."""
+    with httpx.Client() as client:
+        response = client.get(url, follow_redirects=True)
+        response.raise_for_status()
+        return response.text
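
As a quick sanity check of the module above, a minimal round trip (the feed content is illustrative, and the package is assumed to be installed; `parse_url` would additionally require network access):

```python
import fastfeedparser

rss = """<?xml version="1.0"?>
<rss version="2.0"><channel>
    <title>Demo</title>
    <link>https://example.com/</link>
    <item>
        <title>First post</title>
        <link>https://example.com/first</link>
        <pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate>
    </item>
</channel></rss>"""

feed = fastfeedparser.parse(rss)
# FastFeedParserDict exposes keys as attributes
print(feed.feed.title)            # Demo
print(feed.entries[0].link)       # https://example.com/first
print(feed.entries[0].published)  # 2024-01-01 12:00:00 UTC (dates normalized to UTC)
```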

fastfeedparser-0.1.0/src/fastfeedparser.egg-info/PKG-INFO
@@ -0,0 +1,150 @@
[150 lines, identical to fastfeedparser-0.1.0/PKG-INFO above]

fastfeedparser-0.1.0/src/fastfeedparser.egg-info/SOURCES.txt
@@ -0,0 +1,11 @@
+LICENSE
+README.md
+pyproject.toml
+setup.cfg
+src/fastfeedparser/__init__.py
+src/fastfeedparser/main.py
+src/fastfeedparser.egg-info/PKG-INFO
+src/fastfeedparser.egg-info/SOURCES.txt
+src/fastfeedparser.egg-info/dependency_links.txt
+src/fastfeedparser.egg-info/requires.txt
+src/fastfeedparser.egg-info/top_level.txt

fastfeedparser-0.1.0/src/fastfeedparser.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+

fastfeedparser-0.1.0/src/fastfeedparser.egg-info/top_level.txt
@@ -0,0 +1 @@
+fastfeedparser