google-news-api 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- google_news_api-0.0.1/LICENSE +21 -0
- google_news_api-0.0.1/PKG-INFO +135 -0
- google_news_api-0.0.1/README.md +117 -0
- google_news_api-0.0.1/google_news_api/__init__.py +35 -0
- google_news_api-0.0.1/google_news_api/client.py +392 -0
- google_news_api-0.0.1/google_news_api/config.py +105 -0
- google_news_api-0.0.1/google_news_api/exceptions.py +114 -0
- google_news_api-0.0.1/google_news_api/logging.py +117 -0
- google_news_api-0.0.1/google_news_api/utils.py +340 -0
- google_news_api-0.0.1/pyproject.toml +28 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Paolo Mazza
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: google-news-api
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary:
|
|
5
|
+
Author: Paolo Mazza
|
|
6
|
+
Author-email: mazzapaolo2019@gmail.com
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Requires-Dist: feedparser (>=6.0.11,<7.0.0)
|
|
15
|
+
Requires-Dist: httpx (>=0.28.1,<0.29.0)
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# Google News API Client
|
|
19
|
+
|
|
20
|
+
A robust Python client library for the Google News RSS feed API that provides both synchronous and asynchronous implementations with built-in rate limiting, caching, and error handling.
|
|
21
|
+
|
|
22
|
+
## Features
|
|
23
|
+
|
|
24
|
+
- ✨ Comprehensive news search and retrieval functionality
|
|
25
|
+
- 🔄 Both synchronous and asynchronous APIs
|
|
26
|
+
- 🚀 High performance with in-memory caching (TTL-based)
|
|
27
|
+
- 🛡️ Built-in rate limiting with token bucket algorithm
|
|
28
|
+
- 🔁 Automatic retries with exponential backoff
|
|
29
|
+
- 🌍 Multi-language and country support
|
|
30
|
+
- 🛠️ Robust error handling and validation
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install google-news-api
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Quick Start
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from google_news_api import GoogleNewsClient
|
|
42
|
+
|
|
43
|
+
# Initialize client
|
|
44
|
+
client = GoogleNewsClient(language="en", country="US")
|
|
45
|
+
|
|
46
|
+
# Search for news
|
|
47
|
+
articles = client.search("artificial intelligence", max_results=5)
|
|
48
|
+
for article in articles:
|
|
49
|
+
print(f"{article['title']} - {article['source']}")
|
|
50
|
+
|
|
51
|
+
# Get top headlines
|
|
52
|
+
top_news = client.top_news(max_results=3)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
For async usage:
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from google_news_api import AsyncGoogleNewsClient
|
|
59
|
+
import asyncio
|
|
60
|
+
|
|
61
|
+
async def main():
|
|
62
|
+
async with AsyncGoogleNewsClient() as client:
|
|
63
|
+
articles = await client.search("python programming")
|
|
64
|
+
print(f"Found {len(articles)} articles")
|
|
65
|
+
|
|
66
|
+
asyncio.run(main())
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Configuration Options
|
|
70
|
+
|
|
71
|
+
| Parameter | Description | Default | Example |
|
|
72
|
+
|-----------|-------------|---------|---------|
|
|
73
|
+
| `language` | Two-letter language code (ISO 639-1) | `"en"` | `"es"`, `"fr"` |
|
|
74
|
+
| `country` | Two-letter country code (ISO 3166-1) | `"US"` | `"GB"`, `"DE"` |
|
|
75
|
+
| `requests_per_minute` | Rate limit threshold | `60` | `30`, `100` |
|
|
76
|
+
| `cache_ttl` | Cache duration in seconds | `300` | `600`, `1800` |
|
|
77
|
+
|
|
78
|
+
## Error Handling
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from google_news_api.exceptions import (
|
|
82
|
+
ConfigurationError,
|
|
83
|
+
ValidationError,
|
|
84
|
+
HTTPError,
|
|
85
|
+
RateLimitError,
|
|
86
|
+
ParsingError
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
try:
|
|
90
|
+
articles = client.search("technology")
|
|
91
|
+
except ValidationError:
|
|
92
|
+
pass  # Handle invalid search parameters
|
|
93
|
+
except RateLimitError:
|
|
94
|
+
pass  # Handle rate limit exceeded
|
|
95
|
+
except HTTPError:
|
|
96
|
+
pass  # Handle network/server issues
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Best Practices
|
|
100
|
+
|
|
101
|
+
### Resource Management
|
|
102
|
+
- Use context managers for async clients
|
|
103
|
+
- Properly close sync clients when done
|
|
104
|
+
- Implement appropriate error handling
|
|
105
|
+
|
|
106
|
+
### Performance Optimization
|
|
107
|
+
- Leverage caching for frequently accessed data
|
|
108
|
+
- Use async client for concurrent operations
|
|
109
|
+
- Group related requests to maximize cache hits
|
|
110
|
+
|
|
111
|
+
### Rate Limiting
|
|
112
|
+
- Configure `requests_per_minute` based on needs
|
|
113
|
+
- Handle rate limit errors gracefully
|
|
114
|
+
- Implement backoff strategies for retries
|
|
115
|
+
|
|
116
|
+
## Contributing
|
|
117
|
+
|
|
118
|
+
1. Fork the repository
|
|
119
|
+
2. Create a feature branch
|
|
120
|
+
3. Commit your changes
|
|
121
|
+
4. Push to the branch
|
|
122
|
+
5. Create a Pull Request
|
|
123
|
+
|
|
124
|
+
## License
|
|
125
|
+
|
|
126
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
127
|
+
|
|
128
|
+
## Author
|
|
129
|
+
|
|
130
|
+
Paolo Mazza (mazzapaolo2019@gmail.com)
|
|
131
|
+
|
|
132
|
+
## Support
|
|
133
|
+
|
|
134
|
+
For issues and feature requests, please use the GitHub issue tracker.
|
|
135
|
+
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# Google News API Client
|
|
2
|
+
|
|
3
|
+
A robust Python client library for the Google News RSS feed API that provides both synchronous and asynchronous implementations with built-in rate limiting, caching, and error handling.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- ✨ Comprehensive news search and retrieval functionality
|
|
8
|
+
- 🔄 Both synchronous and asynchronous APIs
|
|
9
|
+
- 🚀 High performance with in-memory caching (TTL-based)
|
|
10
|
+
- 🛡️ Built-in rate limiting with token bucket algorithm
|
|
11
|
+
- 🔁 Automatic retries with exponential backoff
|
|
12
|
+
- 🌍 Multi-language and country support
|
|
13
|
+
- 🛠️ Robust error handling and validation
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install google-news-api
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Quick Start
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from google_news_api import GoogleNewsClient
|
|
25
|
+
|
|
26
|
+
# Initialize client
|
|
27
|
+
client = GoogleNewsClient(language="en", country="US")
|
|
28
|
+
|
|
29
|
+
# Search for news
|
|
30
|
+
articles = client.search("artificial intelligence", max_results=5)
|
|
31
|
+
for article in articles:
|
|
32
|
+
print(f"{article['title']} - {article['source']}")
|
|
33
|
+
|
|
34
|
+
# Get top headlines
|
|
35
|
+
top_news = client.top_news(max_results=3)
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
For async usage:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from google_news_api import AsyncGoogleNewsClient
|
|
42
|
+
import asyncio
|
|
43
|
+
|
|
44
|
+
async def main():
|
|
45
|
+
async with AsyncGoogleNewsClient() as client:
|
|
46
|
+
articles = await client.search("python programming")
|
|
47
|
+
print(f"Found {len(articles)} articles")
|
|
48
|
+
|
|
49
|
+
asyncio.run(main())
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Configuration Options
|
|
53
|
+
|
|
54
|
+
| Parameter | Description | Default | Example |
|
|
55
|
+
|-----------|-------------|---------|---------|
|
|
56
|
+
| `language` | Two-letter language code (ISO 639-1) | `"en"` | `"es"`, `"fr"` |
|
|
57
|
+
| `country` | Two-letter country code (ISO 3166-1) | `"US"` | `"GB"`, `"DE"` |
|
|
58
|
+
| `requests_per_minute` | Rate limit threshold | `60` | `30`, `100` |
|
|
59
|
+
| `cache_ttl` | Cache duration in seconds | `300` | `600`, `1800` |
|
|
60
|
+
|
|
61
|
+
## Error Handling
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from google_news_api.exceptions import (
|
|
65
|
+
ConfigurationError,
|
|
66
|
+
ValidationError,
|
|
67
|
+
HTTPError,
|
|
68
|
+
RateLimitError,
|
|
69
|
+
ParsingError
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
try:
|
|
73
|
+
articles = client.search("technology")
|
|
74
|
+
except ValidationError:
|
|
75
|
+
pass  # Handle invalid search parameters
|
|
76
|
+
except RateLimitError:
|
|
77
|
+
pass  # Handle rate limit exceeded
|
|
78
|
+
except HTTPError:
|
|
79
|
+
pass  # Handle network/server issues
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Best Practices
|
|
83
|
+
|
|
84
|
+
### Resource Management
|
|
85
|
+
- Use context managers for async clients
|
|
86
|
+
- Properly close sync clients when done
|
|
87
|
+
- Implement appropriate error handling
|
|
88
|
+
|
|
89
|
+
### Performance Optimization
|
|
90
|
+
- Leverage caching for frequently accessed data
|
|
91
|
+
- Use async client for concurrent operations
|
|
92
|
+
- Group related requests to maximize cache hits
|
|
93
|
+
|
|
94
|
+
### Rate Limiting
|
|
95
|
+
- Configure `requests_per_minute` based on needs
|
|
96
|
+
- Handle rate limit errors gracefully
|
|
97
|
+
- Implement backoff strategies for retries
|
|
98
|
+
|
|
99
|
+
## Contributing
|
|
100
|
+
|
|
101
|
+
1. Fork the repository
|
|
102
|
+
2. Create a feature branch
|
|
103
|
+
3. Commit your changes
|
|
104
|
+
4. Push to the branch
|
|
105
|
+
5. Create a Pull Request
|
|
106
|
+
|
|
107
|
+
## License
|
|
108
|
+
|
|
109
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
110
|
+
|
|
111
|
+
## Author
|
|
112
|
+
|
|
113
|
+
Paolo Mazza (mazzapaolo2019@gmail.com)
|
|
114
|
+
|
|
115
|
+
## Support
|
|
116
|
+
|
|
117
|
+
For issues and feature requests, please use the GitHub issue tracker.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Google News API package."""
|
|
2
|
+
|
|
3
|
+
from .client import AsyncGoogleNewsClient, GoogleNewsClient
|
|
4
|
+
from .config import ClientConfig
|
|
5
|
+
from .exceptions import (
|
|
6
|
+
ConfigurationError,
|
|
7
|
+
GoogleNewsError,
|
|
8
|
+
HTTPError,
|
|
9
|
+
ParsingError,
|
|
10
|
+
RateLimitError,
|
|
11
|
+
ValidationError,
|
|
12
|
+
)
|
|
13
|
+
from .logging import setup_logging
|
|
14
|
+
from .utils import AsyncCache, AsyncRateLimiter, Cache, RateLimiter
|
|
15
|
+
|
|
16
|
+
# Keep in sync with the "Version" field of the package metadata (0.0.1).
__version__ = "0.0.1"
|
|
17
|
+
__author__ = "Paolo Mazza"
|
|
18
|
+
__email__ = "mazzapaolo2019@gmail.com"
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
"AsyncGoogleNewsClient",
|
|
22
|
+
"GoogleNewsClient",
|
|
23
|
+
"ClientConfig",
|
|
24
|
+
"ConfigurationError",
|
|
25
|
+
"GoogleNewsError",
|
|
26
|
+
"HTTPError",
|
|
27
|
+
"ParsingError",
|
|
28
|
+
"RateLimitError",
|
|
29
|
+
"ValidationError",
|
|
30
|
+
"setup_logging",
|
|
31
|
+
"AsyncCache",
|
|
32
|
+
"AsyncRateLimiter",
|
|
33
|
+
"Cache",
|
|
34
|
+
"RateLimiter",
|
|
35
|
+
]
|
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
"""Google News API client implementations.
|
|
2
|
+
|
|
3
|
+
Provides synchronous and asynchronous clients for
|
|
4
|
+
Google News RSS feed API with rate limiting, caching,
|
|
5
|
+
and automatic retries. See GoogleNewsClient and
|
|
6
|
+
AsyncGoogleNewsClient for usage.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import platform
|
|
11
|
+
import random
|
|
12
|
+
from abc import ABC, abstractmethod
|
|
13
|
+
from typing import Any, Dict, List, Optional
|
|
14
|
+
from urllib.parse import urlencode
|
|
15
|
+
|
|
16
|
+
import feedparser
|
|
17
|
+
import httpx
|
|
18
|
+
from feedparser import FeedParserDict
|
|
19
|
+
|
|
20
|
+
from .exceptions import (
|
|
21
|
+
ConfigurationError,
|
|
22
|
+
HTTPError,
|
|
23
|
+
ParsingError,
|
|
24
|
+
RateLimitError,
|
|
25
|
+
ValidationError,
|
|
26
|
+
)
|
|
27
|
+
from .utils import (
|
|
28
|
+
AsyncCache,
|
|
29
|
+
AsyncRateLimiter,
|
|
30
|
+
Cache,
|
|
31
|
+
RateLimiter,
|
|
32
|
+
retry_async,
|
|
33
|
+
retry_sync,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
logger = logging.getLogger(__name__)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _generate_chrome_version(major: int = 122) -> str:
    """Generate a plausible Chrome version number.

    Args:
        major: Major Chrome version to report. Defaults to 122.

    Returns:
        A string of the form ``"<major>.0.<build>.<patch>"`` where ``build``
        is a random integer in [0, 5000] and ``patch`` a random integer in
        [0, 300].
    """
    build = random.randint(0, 5000)
    patch = random.randint(0, 300)
    return f"{major}.0.{build}.{patch}"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _get_platform_info():
    """Return the platform token used inside the browser User-Agent string.

    The value is chosen from ``platform.system()``: Windows and Darwin get
    their own tokens, and every other system falls back to the Linux token.
    """
    tokens_by_system = {
        "Windows": "Windows NT 10.0; Win64; x64",
        "Darwin": "Macintosh; Intel Mac OS X 10_15_7",
    }
    return tokens_by_system.get(platform.system(), "X11; Linux x86_64")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Browser-like default headers passed to the HTTP clients. The dict is built
# once at import time, so the randomised Chrome build number in the User-Agent
# stays the same for the lifetime of the process.
CHROME_HEADERS = {
    "User-Agent": (
        f"Mozilla/5.0 ({_get_platform_info()}) AppleWebKit/537.36 "
        f"(KHTML, like Gecko) Chrome/{_generate_chrome_version()} Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,"
        "image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    # "br" is deliberately not advertised: httpx only decodes Brotli when an
    # optional Brotli package is installed, and the package metadata lists
    # only feedparser and httpx as requirements. Advertising it could yield a
    # response body that cannot be decoded before feed parsing.
    "Accept-Encoding": "gzip, deflate",
    "Sec-Ch-Ua": ('"Not A(Brand";v="99", "Google Chrome";v="122", "Chromium";v="122"'),
    "Sec-Ch-Ua-Mobile": "?0",
    # Chrome reports "macOS" for this hint, whereas platform.system() returns
    # "Darwin" on that platform; other system names are passed through.
    "Sec-Ch-Ua-Platform": (
        '"macOS"' if platform.system() == "Darwin" else f'"{platform.system()}"'
    ),
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "Priority": "u=0, i",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
}
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class BaseGoogleNewsClient(ABC):
    """Base class for Google News API clients.

    Holds the locale configuration and the validation, URL-building and
    feed-parsing helpers shared by the synchronous and asynchronous clients.
    Subclasses provide the rate limiter, cache and HTTP client by implementing
    ``_setup_rate_limiter_and_cache``.
    """

    BASE_URL = "https://news.google.com/"

    # Topics accepted by ``_get_topic_path``, in the order in which they are
    # listed in the validation error message.
    _TOPICS = (
        "WORLD",
        "NATION",
        "BUSINESS",
        "TECHNOLOGY",
        "ENTERTAINMENT",
        "SPORTS",
        "SCIENCE",
        "HEALTH",
    )

    def __init__(
        self,
        language: str = "en",
        country: str = "US",
        requests_per_minute: int = 60,
        cache_ttl: int = 300,
    ) -> None:
        """
        Initialize the Google News API client.

        Args:
            language (str): Language code (e.g., "en", "fr", "de") or
                language-country format (e.g., "en-US", "fr-FR")
            country (str): Country code (e.g., "US", "FR", "DE")
            requests_per_minute (int): Number of requests per minute
            cache_ttl (int): Cache time-to-live in seconds

        Raises:
            ConfigurationError: If ``language`` or ``country`` fails validation.
        """
        self._validate_language(language)
        self._validate_country(country)

        # Normalise the country first so that every derived value (including
        # ``language_full``) uses the same upper-case form.
        self.country = country.upper()
        self.language_full = (
            language.upper()
            if "-" in language
            else f"{language.upper()}-{self.country}"
        )
        self.language_base = language.split("-")[0].lower()
        self._setup_rate_limiter_and_cache(requests_per_minute, cache_ttl)

    @abstractmethod
    def _setup_rate_limiter_and_cache(
        self, requests_per_minute: int, cache_ttl: int
    ) -> None:
        """Create the rate limiter, cache and HTTP client for this client.

        Called at the end of ``__init__``, after validation has succeeded.
        """
        pass

    @staticmethod
    def _validate_language(language: str) -> None:
        """Check that ``language`` looks like "xx" or "xx-YY".

        Raises:
            ConfigurationError: If ``language`` is not a string, has more than
                one "-" separator, or its first part is not two characters.
        """
        # A non-string is reported as a configuration error (as in
        # ``_validate_country``) instead of failing on ``.split``.
        parts = language.split("-") if isinstance(language, str) else []
        if not parts or len(parts) > 2 or len(parts[0]) != 2:
            raise ConfigurationError(
                "Language must be a two-letter ISO 639-1 "
                "code or language-COUNTRY format",
                field="language",
                value=language,
            )

    @staticmethod
    def _validate_country(country: str) -> None:
        """Check that ``country`` is a two-character string.

        Raises:
            ConfigurationError: If ``country`` is not a string of length two.
        """
        if not isinstance(country, str) or len(country) != 2:
            raise ConfigurationError(
                "Country must be a two-letter ISO 3166-1 alpha-2 code",
                field="country",
                value=country,
            )

    def _validate_query(self, query: str) -> None:
        """Check that ``query`` is a string with non-whitespace content.

        Raises:
            ValidationError: If ``query`` is not a string, is empty, or
                contains only whitespace.
        """
        if not isinstance(query, str) or not query.strip():
            raise ValidationError(
                "Query must be a non-empty string",
                field="query",
                value=query,
            )

    def _locale_params(self) -> Dict[str, str]:
        """Return the locale query parameters appended to every feed URL."""
        return {
            "hl": self.language_full,
            "gl": self.country,
            "ceid": f"{self.country}:{self.language_base}",
        }

    def _build_url(self, path: str) -> str:
        """Build the RSS feed URL for ``path``.

        Args:
            path: One of: a search path ("search?q=<query>"), an empty string
                (world headlines), a "topic/<NAME>" path, or any other path
                placed directly under ``rss/``.

        Returns:
            The absolute feed URL including URL-encoded query parameters.
        """
        params: Dict[str, str] = {}

        if path.startswith("search"):
            # Take everything after the FIRST "q=" so that a query which
            # itself contains "q=" is not truncated.
            query = path.partition("q=")[2]
            base = f"{self.BASE_URL}rss/search"
            # "+" is treated as an encoded space here.
            # NOTE(review): the search() callers in this file pass the raw
            # query, so a literal "+" (e.g. "C++") is sent as spaces - confirm
            # whether that is intended.
            params["q"] = query.replace("+", " ")
        elif not path:
            base = f"{self.BASE_URL}rss/headlines/section/topic/WORLD"
        elif path.startswith("topic/"):
            base = f"{self.BASE_URL}rss/headlines/section/{path}"
        else:
            base = f"{self.BASE_URL}rss/{path}"

        # Locale parameters always follow "q" so the parameter order matches
        # across all URL kinds.
        params.update(self._locale_params())
        return f"{base}?{urlencode(params)}"

    def _parse_articles(
        self, feed: FeedParserDict, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """Convert feed entries into plain article dictionaries.

        Args:
            feed: Parsed feed whose ``entries`` are converted.
            max_results: Maximum number of entries to return. ``None`` (or any
                other falsy value) returns all entries.

        Returns:
            A list of dicts with the keys "title", "link", "published",
            "summary" and "source". "summary" defaults to an empty string and
            "source" to ``None`` when the entry does not provide them.
        """
        articles = feed.entries[:max_results] if max_results else feed.entries
        return [
            {
                "title": entry.title,
                "link": entry.link,
                "published": entry.published,
                "summary": entry.get("summary", ""),
                # A source element without a title yields None instead of
                # raising while building the result.
                "source": (
                    entry.source.get("title") if "source" in entry else None
                ),
            }
            for entry in articles
        ]

    def _get_topic_path(self, topic: str) -> str:
        """Return the URL path segment for a headline topic.

        Args:
            topic: Topic name, matched case-insensitively against the
                supported topics.

        Returns:
            A path of the form "topic/<TOPIC>".

        Raises:
            ValidationError: If ``topic`` is not a supported topic.
        """
        topic = topic.upper()
        if topic not in self._TOPICS:
            raise ValidationError(
                f"Invalid topic. Must be one of: {', '.join(self._TOPICS)}",
                field="topic",
                value=topic,
            )

        return f"topic/{topic}"
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
class GoogleNewsClient(BaseGoogleNewsClient):
    """Synchronous client for Google News RSS feed API.

    The client owns an ``httpx.Client``. Release it with ``close()`` or by
    using the client as a context manager::

        with GoogleNewsClient() as client:
            articles = client.search("python")
    """

    def _setup_rate_limiter_and_cache(
        self, requests_per_minute: int, cache_ttl: int
    ) -> None:
        """Create the rate limiter, cache and HTTP client used for requests."""
        self.rate_limiter = RateLimiter(requests_per_minute)
        self.cache = Cache(ttl=cache_ttl)
        self.client = httpx.Client(
            follow_redirects=True, timeout=30.0, headers=CHROME_HEADERS
        )

    def close(self) -> None:
        """Close the underlying HTTP client, if one was created."""
        # ``client`` is missing when __init__ raised during validation, i.e.
        # before _setup_rate_limiter_and_cache ran.
        client = getattr(self, "client", None)
        if client is not None:
            client.close()

    def __enter__(self) -> "GoogleNewsClient":
        """Enter the context manager."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Exit the context manager and close the client."""
        self.close()

    def __del__(self) -> None:
        """Close the client."""
        self.close()

    @staticmethod
    def _parse_retry_after(response: httpx.Response, default: float = 60.0) -> float:
        """Return the ``Retry-After`` delay in seconds, or ``default``.

        The header may also carry an HTTP-date; that form (or any other
        non-numeric value) falls back to ``default`` instead of raising.
        """
        raw = response.headers.get("Retry-After")
        if raw is None:
            return default
        try:
            return float(raw)
        except ValueError:
            return default

    @retry_sync(exceptions=(HTTPError, RateLimitError), max_retries=3, backoff=2.0)
    def _fetch_feed(self, url: str) -> FeedParserDict:
        """Fetch and parse the feed at ``url``, using the cache when possible.

        A cached feed is returned without touching the network. Otherwise the
        request is made inside the rate limiter, the response is parsed with
        feedparser, and the parsed feed is cached under ``url``.

        Raises:
            RateLimitError: If the server answers with HTTP 429.
            HTTPError: If the response is not successful or the request fails
                at the transport level.
            ParsingError: If feedparser flags the feed as malformed (bozo).
        """
        cached = self.cache.get(url)
        if cached is not None:
            return cached

        with self.rate_limiter:
            try:
                response = self.client.get(url)

                if response.status_code == 429:
                    raise RateLimitError(
                        "Rate limit exceeded",
                        retry_after=self._parse_retry_after(response),
                        response=response,
                    )

                if not response.is_success:
                    raise HTTPError(
                        f"HTTP {response.status_code}: {response.reason_phrase}",
                        status_code=response.status_code,
                        response_text=response.text,
                    )

                feed = feedparser.parse(response.text)

                if feed.bozo:
                    raise ParsingError(
                        "Failed to parse feed",
                        data=response.text,
                        error=feed.bozo_exception,
                    )

                self.cache.set(url, feed)
                return feed

            except httpx.RequestError as e:
                # Chain the transport error so the root cause stays visible.
                raise HTTPError(f"Request failed: {str(e)}") from e

    def search(
        self,
        query: str,
        *,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Search for news articles.

        Args:
            query: Search text; validated by ``_validate_query``.
            max_results: Maximum number of articles to return; ``None``
                returns every entry in the feed.

        Returns:
            A list of article dictionaries as produced by ``_parse_articles``.
        """
        self._validate_query(query)
        url = self._build_url(f"search?q={query}")
        feed = self._fetch_feed(url)
        return self._parse_articles(feed, max_results)

    def top_news(
        self,
        topic: str = "WORLD",
        *,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Get top news articles for a topic.

        Args:
            topic: Topic name; validated by ``_get_topic_path``.
            max_results: Maximum number of articles to return; ``None``
                returns every entry in the feed.

        Returns:
            A list of article dictionaries as produced by ``_parse_articles``.
        """
        path = self._get_topic_path(topic)
        url = self._build_url(path)
        feed = self._fetch_feed(url)
        return self._parse_articles(feed, max_results)
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
class AsyncGoogleNewsClient(BaseGoogleNewsClient):
    """Asynchronous client for Google News RSS feed API.

    The client owns an ``httpx.AsyncClient``. Release it with ``aclose()`` or
    by using the client in an ``async with`` block.
    """

    def _setup_rate_limiter_and_cache(
        self, requests_per_minute: int, cache_ttl: int
    ) -> None:
        """Create the rate limiter, cache and HTTP client used for requests."""
        self.rate_limiter = AsyncRateLimiter(requests_per_minute)
        self.cache = AsyncCache(ttl=cache_ttl)
        self.client = httpx.AsyncClient(
            follow_redirects=True, timeout=30.0, headers=CHROME_HEADERS
        )

    async def __aenter__(self) -> "AsyncGoogleNewsClient":
        """Enter the context manager."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Exit the context manager and close the client."""
        await self.aclose()

    async def aclose(self) -> None:
        """Close the client."""
        await self.client.aclose()

    @staticmethod
    def _parse_retry_after(response: httpx.Response, default: float = 60.0) -> float:
        """Return the ``Retry-After`` delay in seconds, or ``default``.

        The header may also carry an HTTP-date; that form (or any other
        non-numeric value) falls back to ``default`` instead of raising.
        """
        raw = response.headers.get("Retry-After")
        if raw is None:
            return default
        try:
            return float(raw)
        except ValueError:
            return default

    @retry_async(exceptions=(HTTPError, RateLimitError), max_retries=3, backoff=2.0)
    async def _fetch_feed(self, url: str) -> FeedParserDict:
        """Fetch and parse the feed at ``url``, using the cache when possible.

        A cached feed is returned without touching the network. Otherwise the
        request is made inside the rate limiter, the response is parsed with
        feedparser, and the parsed feed is cached under ``url``.

        Raises:
            RateLimitError: If the server answers with HTTP 429.
            HTTPError: If the response is not successful or the request fails
                at the transport level.
            ParsingError: If feedparser flags the feed as malformed (bozo).
        """
        cached = await self.cache.get(url)
        if cached is not None:
            return cached

        async with self.rate_limiter:
            try:
                response = await self.client.get(url)

                if response.status_code == 429:
                    raise RateLimitError(
                        "Rate limit exceeded",
                        retry_after=self._parse_retry_after(response),
                        response=response,
                    )

                if not response.is_success:
                    raise HTTPError(
                        f"HTTP {response.status_code}: {response.reason_phrase}",
                        status_code=response.status_code,
                        response_text=response.text,
                    )

                feed = feedparser.parse(response.text)

                if feed.bozo:
                    raise ParsingError(
                        "Failed to parse feed",
                        data=response.text,
                        error=feed.bozo_exception,
                    )

                await self.cache.set(url, feed)
                return feed

            except httpx.RequestError as e:
                # Chain the transport error so the root cause stays visible.
                raise HTTPError(f"Request failed: {str(e)}") from e

    async def search(
        self,
        query: str,
        *,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Search for news articles asynchronously.

        Args:
            query: Search text; validated by ``_validate_query``.
            max_results: Maximum number of articles to return; ``None``
                returns every entry in the feed.

        Returns:
            A list of article dictionaries as produced by ``_parse_articles``.
        """
        self._validate_query(query)
        url = self._build_url(f"search?q={query}")
        feed = await self._fetch_feed(url)
        return self._parse_articles(feed, max_results)

    async def top_news(
        self,
        topic: str = "WORLD",
        *,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Get top news articles for a topic asynchronously.

        Args:
            topic: Topic name; validated by ``_get_topic_path``.
            max_results: Maximum number of articles to return; ``None``
                returns every entry in the feed.

        Returns:
            A list of article dictionaries as produced by ``_parse_articles``.
        """
        path = self._get_topic_path(topic)
        url = self._build_url(path)
        feed = await self._fetch_feed(url)
        return self._parse_articles(feed, max_results)
|