supadata 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- supadata-1.0.0/.gitignore +60 -0
- supadata-1.0.0/LICENSE +21 -0
- supadata-1.0.0/PKG-INFO +97 -0
- supadata-1.0.0/README.md +76 -0
- supadata-1.0.0/pyproject.toml +30 -0
- supadata-1.0.0/supadata/__init__.py +26 -0
- supadata-1.0.0/supadata/client.py +164 -0
- supadata-1.0.0/supadata/errors.py +31 -0
- supadata-1.0.0/supadata/types.py +95 -0
- supadata-1.0.0/tests/__init__.py +1 -0
- supadata-1.0.0/tests/test_client.py +180 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
**/__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
**/build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
**/dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
**/*.egg-info/
|
|
20
|
+
.installed.cfg
|
|
21
|
+
*.egg
|
|
22
|
+
MANIFEST
|
|
23
|
+
|
|
24
|
+
# Virtual Environment
|
|
25
|
+
.env
|
|
26
|
+
**/.venv/
|
|
27
|
+
**/env/
|
|
28
|
+
**/venv/
|
|
29
|
+
ENV/
|
|
30
|
+
env.bak/
|
|
31
|
+
venv.bak/
|
|
32
|
+
|
|
33
|
+
# IDE
|
|
34
|
+
**/.idea/
|
|
35
|
+
**/.vscode/
|
|
36
|
+
*.swp
|
|
37
|
+
*.swo
|
|
38
|
+
.DS_Store
|
|
39
|
+
|
|
40
|
+
# Testing
|
|
41
|
+
.coverage
|
|
42
|
+
**/.pytest_cache/
|
|
43
|
+
**/htmlcov/
|
|
44
|
+
**/.tox/
|
|
45
|
+
**/.nox/
|
|
46
|
+
coverage.xml
|
|
47
|
+
*.cover
|
|
48
|
+
*.py,cover
|
|
49
|
+
**/.hypothesis/
|
|
50
|
+
|
|
51
|
+
# Distribution / packaging
|
|
52
|
+
.Python
|
|
53
|
+
*.manifest
|
|
54
|
+
*.spec
|
|
55
|
+
|
|
56
|
+
# Jupyter Notebook
|
|
57
|
+
**/.ipynb_checkpoints
|
|
58
|
+
|
|
59
|
+
# Logs
|
|
60
|
+
*.log
|
supadata-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Supadata
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
supadata-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: supadata
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: The official Python SDK for Supadata - scrape web content and YouTube transcripts with ease
|
|
5
|
+
Project-URL: homepage, https://supadata.ai
|
|
6
|
+
Project-URL: repository, https://github.com/supadata/supadata-py
|
|
7
|
+
Project-URL: documentation, https://supadata.ai/documentation
|
|
8
|
+
Author-email: Supadata <support@supadata.ai>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: ai,api,llm,supadata,transcripts,web-scraping,youtube
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Requires-Python: >=3.7
|
|
16
|
+
Requires-Dist: requests>=2.28.1
|
|
17
|
+
Provides-Extra: test
|
|
18
|
+
Requires-Dist: pytest>=7.0.0; extra == 'test'
|
|
19
|
+
Requires-Dist: requests-mock>=1.11.0; extra == 'test'
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# Supadata Python SDK
|
|
23
|
+
|
|
24
|
+
[![PyPI version](https://badge.fury.io/py/supadata.svg)](https://badge.fury.io/py/supadata)
|
|
25
|
+
[![MIT license](https://img.shields.io/badge/License-MIT-blue.svg)](http://opensource.org/licenses/MIT)
|
|
26
|
+
|
|
27
|
+
The official Python SDK for Supadata.
|
|
28
|
+
|
|
29
|
+
Get your free API key at [supadata.ai](https://supadata.ai) and start scraping data in minutes.
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install supadata
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Usage
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from supadata import Supadata
|
|
41
|
+
|
|
42
|
+
# Initialize the client
|
|
43
|
+
client = Supadata(api_key="YOUR_API_KEY")
|
|
44
|
+
|
|
45
|
+
# Get YouTube transcript
|
|
46
|
+
transcript = client.get_transcript(video_id="VIDEO_ID")
|
|
47
|
+
print(f"Got transcript in {transcript.lang}")
|
|
48
|
+
|
|
49
|
+
# Translate YouTube transcript to Spanish
|
|
50
|
+
translated = client.translate_transcript(
|
|
51
|
+
video_id="VIDEO_ID",
|
|
52
|
+
lang="es"
|
|
53
|
+
)
|
|
54
|
+
print(f"Got translated transcript in {translated.lang}")
|
|
55
|
+
|
|
56
|
+
# Get plain text transcript
|
|
57
|
+
text_transcript = client.get_transcript(
|
|
58
|
+
video_id="VIDEO_ID",
|
|
59
|
+
text=True
|
|
60
|
+
)
|
|
61
|
+
print(text_transcript.content)
|
|
62
|
+
|
|
63
|
+
# Scrape web content
|
|
64
|
+
web_content = client.scrape("https://supadata.ai")
|
|
65
|
+
print(f"Page title: {web_content.name}")
|
|
66
|
+
print(f"Content length: {web_content.count_characters} characters")
|
|
67
|
+
|
|
68
|
+
# Map website URLs
|
|
69
|
+
site_map = client.map("https://supadata.ai")
|
|
70
|
+
print(f"Found {len(site_map.urls)} URLs")
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Error Handling
|
|
74
|
+
|
|
75
|
+
The SDK uses the standard `requests` library and will raise `requests.exceptions.RequestException` for API-related errors:
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from requests.exceptions import RequestException
|
|
79
|
+
|
|
80
|
+
try:
|
|
81
|
+
transcript = client.get_transcript(video_id="INVALID_ID")
|
|
82
|
+
except RequestException as error:
|
|
83
|
+
print(f"API request failed: {error}")
|
|
84
|
+
if error.response is not None:
|
|
85
|
+
error_data = error.response.json()
|
|
86
|
+
print(f"Error code: {error_data.get('code')}")
|
|
87
|
+
print(f"Error title: {error_data.get('title')}")
|
|
88
|
+
print(f"Error description: {error_data.get('description')}")
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## API Reference
|
|
92
|
+
|
|
93
|
+
See the [Documentation](https://supadata.ai/documentation) for more details on all possible parameters and options.
|
|
94
|
+
|
|
95
|
+
## License
|
|
96
|
+
|
|
97
|
+
MIT
|
supadata-1.0.0/README.md
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# Supadata Python SDK
|
|
2
|
+
|
|
3
|
+
[![PyPI version](https://badge.fury.io/py/supadata.svg)](https://badge.fury.io/py/supadata)
|
|
4
|
+
[![MIT license](https://img.shields.io/badge/License-MIT-blue.svg)](http://opensource.org/licenses/MIT)
|
|
5
|
+
|
|
6
|
+
The official Python SDK for Supadata.
|
|
7
|
+
|
|
8
|
+
Get your free API key at [supadata.ai](https://supadata.ai) and start scraping data in minutes.
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install supadata
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Usage
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
from supadata import Supadata
|
|
20
|
+
|
|
21
|
+
# Initialize the client
|
|
22
|
+
client = Supadata(api_key="YOUR_API_KEY")
|
|
23
|
+
|
|
24
|
+
# Get YouTube transcript
|
|
25
|
+
transcript = client.get_transcript(video_id="VIDEO_ID")
|
|
26
|
+
print(f"Got transcript in {transcript.lang}")
|
|
27
|
+
|
|
28
|
+
# Translate YouTube transcript to Spanish
|
|
29
|
+
translated = client.translate_transcript(
|
|
30
|
+
video_id="VIDEO_ID",
|
|
31
|
+
lang="es"
|
|
32
|
+
)
|
|
33
|
+
print(f"Got translated transcript in {translated.lang}")
|
|
34
|
+
|
|
35
|
+
# Get plain text transcript
|
|
36
|
+
text_transcript = client.get_transcript(
|
|
37
|
+
video_id="VIDEO_ID",
|
|
38
|
+
text=True
|
|
39
|
+
)
|
|
40
|
+
print(text_transcript.content)
|
|
41
|
+
|
|
42
|
+
# Scrape web content
|
|
43
|
+
web_content = client.scrape("https://supadata.ai")
|
|
44
|
+
print(f"Page title: {web_content.name}")
|
|
45
|
+
print(f"Content length: {web_content.count_characters} characters")
|
|
46
|
+
|
|
47
|
+
# Map website URLs
|
|
48
|
+
site_map = client.map("https://supadata.ai")
|
|
49
|
+
print(f"Found {len(site_map.urls)} URLs")
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Error Handling
|
|
53
|
+
|
|
54
|
+
The SDK uses the standard `requests` library and will raise `requests.exceptions.RequestException` for API-related errors:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from requests.exceptions import RequestException
|
|
58
|
+
|
|
59
|
+
try:
|
|
60
|
+
transcript = client.get_transcript(video_id="INVALID_ID")
|
|
61
|
+
except RequestException as error:
|
|
62
|
+
print(f"API request failed: {error}")
|
|
63
|
+
if error.response is not None:
|
|
64
|
+
error_data = error.response.json()
|
|
65
|
+
print(f"Error code: {error_data.get('code')}")
|
|
66
|
+
print(f"Error title: {error_data.get('title')}")
|
|
67
|
+
print(f"Error description: {error_data.get('description')}")
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## API Reference
|
|
71
|
+
|
|
72
|
+
See the [Documentation](https://supadata.ai/documentation) for more details on all possible parameters and options.
|
|
73
|
+
|
|
74
|
+
## License
|
|
75
|
+
|
|
76
|
+
MIT
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "supadata"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
authors = [{ name = "Supadata", email = "support@supadata.ai" }]
|
|
9
|
+
dependencies = ["requests >= 2.28.1"]
|
|
10
|
+
description = "The official Python SDK for Supadata - scrape web content and YouTube transcripts with ease"
|
|
11
|
+
readme = "README.md"
|
|
12
|
+
requires-python = ">=3.7"
|
|
13
|
+
license = "MIT"
|
|
14
|
+
keywords = ["supadata", "web-scraping", "youtube", "transcripts", "api", "llm", "ai"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[project.urls]
|
|
22
|
+
homepage = "https://supadata.ai"
|
|
23
|
+
repository = "https://github.com/supadata/supadata-py"
|
|
24
|
+
documentation = "https://supadata.ai/documentation"
|
|
25
|
+
|
|
26
|
+
[project.optional-dependencies]
|
|
27
|
+
test = [
|
|
28
|
+
"pytest >= 7.0.0",
|
|
29
|
+
"requests-mock >= 1.11.0",
|
|
30
|
+
]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Supadata Python SDK
|
|
3
|
+
|
|
4
|
+
The official Python SDK for Supadata - scrape web and YouTube content with ease.
|
|
5
|
+
"""
|
|
6
|
+
from importlib.metadata import version
|
|
7
|
+
from supadata.client import Supadata
|
|
8
|
+
from supadata.types import (
|
|
9
|
+
Transcript,
|
|
10
|
+
TranslatedTranscript,
|
|
11
|
+
TranscriptChunk,
|
|
12
|
+
Scrape,
|
|
13
|
+
Map,
|
|
14
|
+
Error,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
# Resolve the version from the installed distribution's metadata. When the
# package is imported from a source checkout that was never installed there
# is no metadata to read and importlib.metadata raises PackageNotFoundError;
# fall back to a placeholder so that `import supadata` still succeeds.
from importlib.metadata import PackageNotFoundError

try:
    __version__ = version("supadata")
except PackageNotFoundError:
    __version__ = "0.0.0+unknown"

# Names re-exported as the public API of the package.
__all__ = [
    "Supadata",
    "Transcript",
    "TranslatedTranscript",
    "TranscriptChunk",
    "Scrape",
    "Map",
    "Error",
]
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""Main Supadata client implementation."""
|
|
2
|
+
|
|
3
|
+
from typing import Dict, Any
|
|
4
|
+
import requests
|
|
5
|
+
from dataclasses import asdict
|
|
6
|
+
|
|
7
|
+
from .types import (
|
|
8
|
+
Transcript,
|
|
9
|
+
TranslatedTranscript,
|
|
10
|
+
TranscriptChunk,
|
|
11
|
+
Scrape,
|
|
12
|
+
Map,
|
|
13
|
+
Error,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Supadata:
    """Main Supadata client.

    Holds a ``requests.Session`` that sends the API key on every call and
    turns the camelCase JSON returned by the API into the snake_case
    dataclasses defined in ``supadata.types``.
    """

    def __init__(self, api_key: str, base_url: str = "https://api.supadata.ai/v1"):
        """Initialize Supadata client.

        Args:
            api_key: Your Supadata API key
            base_url: Optional custom API base URL
        """
        self.base_url = base_url
        self.session = requests.Session()
        self.session.headers.update({
            "x-api-key": api_key,
            "Accept": "application/json"
        })

    def get_transcript(self, video_id: str, text: bool = False) -> Transcript:
        """Get transcript for a YouTube video.

        Args:
            video_id: YouTube video ID
            text: Whether to return plain text instead of segments

        Returns:
            Transcript object containing content, language and available languages

        Raises:
            requests.exceptions.RequestException: If the API request fails
        """
        # NOTE(review): requests serialises the bool as "True"/"False" in the
        # query string -- confirm the API accepts that casing.
        response = self._request("GET", "/youtube/transcript", params={
            "videoId": video_id,
            "text": text
        })
        self._convert_chunks(response, text)
        return Transcript(**response)

    def translate_transcript(
        self,
        video_id: str,
        lang: str,
        text: bool = False
    ) -> TranslatedTranscript:
        """Get translated transcript for a YouTube video.

        Args:
            video_id: YouTube video ID
            lang: Target language code (e.g., 'es' for Spanish)
            text: Whether to return plain text instead of segments

        Returns:
            TranslatedTranscript object containing translated content

        Raises:
            requests.exceptions.RequestException: If the API request fails
        """
        response = self._request("GET", "/youtube/transcript/translate", params={
            "videoId": video_id,
            "lang": lang,
            "text": text
        })
        self._convert_chunks(response, text)
        return TranslatedTranscript(**response)

    def scrape(self, url: str) -> Scrape:
        """Scrape content from a web page.

        Args:
            url: URL to scrape

        Returns:
            Scrape object containing the extracted content

        Raises:
            requests.exceptions.RequestException: If the API request fails
        """
        response = self._request("GET", "/web/scrape", params={"url": url})
        return Scrape(**response)

    def map(self, url: str) -> Map:
        """Generate a site map for a website.

        Args:
            url: Base URL to map

        Returns:
            Map object containing discovered URLs

        Raises:
            requests.exceptions.RequestException: If the API request fails
        """
        response = self._request("GET", "/web/map", params={"url": url})
        return Map(**response)

    @staticmethod
    def _convert_chunks(response: Dict[str, Any], text: bool) -> None:
        """Replace raw chunk dicts in ``response["content"]`` with TranscriptChunk objects.

        The response is modified in place. Nothing happens when plain text
        was requested or when the content is not a list.
        """
        if not text and isinstance(response["content"], list):
            response["content"] = [
                TranscriptChunk(**chunk) for chunk in response["content"]
            ]

    def _camel_to_snake(self, d: Any) -> Any:
        """Convert dictionary keys from camelCase to snake_case.

        Dicts and lists are walked recursively; any other value is returned
        unchanged.
        """
        import re

        def convert(name: str) -> str:
            name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
            return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()

        if isinstance(d, dict):
            return {convert(k): self._camel_to_snake(v) for k, v in d.items()}
        if isinstance(d, list):
            return [self._camel_to_snake(i) for i in d]
        return d

    def _request(self, method: str, path: str, **kwargs: Any) -> Dict[str, Any]:
        """Make an HTTP request to the Supadata API.

        Args:
            method: HTTP method
            path: API endpoint path
            **kwargs: Additional arguments to pass to requests

        Returns:
            dict: Parsed JSON response with snake_case keys

        Raises:
            requests.exceptions.RequestException: If the API request fails.
                When the error body matches the ``Error`` shape, the raised
                ``HTTPError`` carries that ``Error`` as ``args[0]`` and the
                original response as ``.response``.
        """
        url = f"{self.base_url}{path}"
        response = self.session.request(method, url, **kwargs)

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            if e.response is not None:
                try:
                    error = Error(**self._camel_to_snake(e.response.json()))
                except (ValueError, TypeError):
                    # Body is not JSON or does not match the Error fields:
                    # fall through and re-raise the original HTTPError.
                    error = None
                if error is not None:
                    # Keep the response attached so callers can still read
                    # the status code and body from the exception.
                    raise requests.exceptions.HTTPError(
                        error, response=e.response
                    ) from e
            raise

        return self._camel_to_snake(response.json())
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Custom exceptions for Supadata SDK."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class SupadataError(Exception):
    """Base exception for all Supadata errors.

    Attributes:
        code: Error code identifying the type of error (e.g., 'video-not-found')
        title: Human readable error title
        description: Detailed error description
        documentation_url: URL to error documentation
    """
    code: str
    title: str
    description: str
    documentation_url: Optional[str] = None

    def __post_init__(self) -> None:
        """Populate ``Exception.args`` from the dataclass fields.

        The dataclass-generated ``__init__`` never calls
        ``Exception.__init__``, so an instance built with keyword arguments
        would otherwise have empty ``args`` and could not be pickled or
        copied (both rebuild the exception from ``args``).
        """
        super().__init__(
            self.code, self.title, self.description, self.documentation_url
        )

    def __str__(self) -> str:
        """Return string representation of the error."""
        parts = [self.description]
        if self.code:
            parts.append(f"Code: {self.code}")
        if self.title:
            parts.append(f"Title: {self.title}")
        if self.documentation_url:
            parts.append(f"Documentation: {self.documentation_url}")
        return " | ".join(parts)
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Type definitions for Supadata API responses."""
|
|
2
|
+
|
|
3
|
+
from typing import List, Optional, Union
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class TranscriptChunk:
    """One timed segment of a video transcript.

    Attributes:
        text: The words spoken in this segment
        offset: Where the segment starts, in milliseconds
        duration: How long the segment lasts, in milliseconds
        lang: ISO 639-1 language code of the segment
    """

    # Field order defines the positional order of the generated __init__.
    text: str
    offset: int
    duration: int
    lang: str
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class Transcript:
    """Transcript of a whole video.

    Attributes:
        content: Either the list of transcript chunks, or a single plain-text
            string when the transcript was requested with text=true
        lang: ISO 639-1 language code of the transcript
        available_langs: Language codes the transcript is available in
    """

    # Field order defines the positional order of the generated __init__.
    content: Union[List[TranscriptChunk], str]
    lang: str
    available_langs: List[str]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class TranslatedTranscript:
    """Video transcript translated into another language.

    Attributes:
        content: Either the list of transcript chunks, or a single plain-text
            string when the transcript was requested with text=true
        lang: ISO 639-1 language code of the translation
    """

    # Field order defines the positional order of the generated __init__.
    content: Union[List[TranscriptChunk], str]
    lang: str
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class Scrape:
    """Result of scraping a single web page.

    Attributes:
        url: Address of the page that was scraped
        content: Markdown extracted from the page
        name: Name of the page
        description: Description of the page
        og_url: Open Graph URL of the page, if any
        count_characters: Number of characters in ``content``
        urls: URLs found on the page
    """

    # Field order defines the positional order of the generated __init__.
    url: str
    content: str
    name: str
    description: str
    og_url: Optional[str]
    count_characters: int
    urls: List[str]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
class Map:
    """Site map: the collection of URLs discovered for a website.

    Attributes:
        urls: The URLs that were found
    """

    urls: List[str]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass
class Error:
    """Standard error response format.

    Attributes:
        code: Error code identifying the type of error
        title: Human readable error title
        description: Detailed error description
        documentation_url: URL to error documentation; ``None`` when the
            error body does not include one
    """
    code: str
    title: str
    description: str
    # Defaulted so that an error body without a documentation URL can still
    # be turned into an Error instead of failing with a TypeError.
    documentation_url: Optional[str] = None
|
|
95
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Tests for the Supadata package."""
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""Tests for the Supadata client."""
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
import requests
|
|
5
|
+
from requests import Response
|
|
6
|
+
|
|
7
|
+
from supadata import (
|
|
8
|
+
Supadata,
|
|
9
|
+
Transcript,
|
|
10
|
+
TranslatedTranscript,
|
|
11
|
+
TranscriptChunk,
|
|
12
|
+
Scrape,
|
|
13
|
+
Map,
|
|
14
|
+
Error,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@pytest.fixture
def api_key() -> str:
    """Provide the placeholder API key used throughout the tests."""
    dummy_key = "test_api_key"
    return dummy_key
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@pytest.fixture
def base_url() -> str:
    """Provide the placeholder API base URL used throughout the tests."""
    dummy_url = "https://api.test.com/v1"
    return dummy_url
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@pytest.fixture
def client(api_key: str, base_url: str) -> Supadata:
    """Build a Supadata client from the dummy key and base URL fixtures."""
    settings = {"api_key": api_key, "base_url": base_url}
    return Supadata(**settings)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_client_initialization(api_key: str, base_url: str) -> None:
    """The constructor stores the base URL and installs the default headers."""
    sdk = Supadata(api_key=api_key, base_url=base_url)

    assert sdk.base_url == base_url

    headers = sdk.session.headers
    assert headers["x-api-key"] == api_key
    assert headers["Accept"] == "application/json"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def test_get_transcript_chunks(client: Supadata, requests_mock) -> None:
    """A segmented transcript is parsed into TranscriptChunk objects."""
    segment = {
        "text": "Hello",
        "offset": 0,
        "duration": 1000,
        "lang": "en",
    }
    payload = {
        "content": [segment],
        "lang": "en",
        "availableLangs": ["en", "es"],
    }
    endpoint = f"{client.base_url}/youtube/transcript"
    requests_mock.get(endpoint, json=payload)

    result = client.get_transcript(video_id="test123")

    assert isinstance(result, Transcript)
    first_chunk = result.content[0]
    assert isinstance(first_chunk, TranscriptChunk)
    assert first_chunk.text == "Hello"
    assert result.lang == "en"
    assert result.available_langs == ["en", "es"]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_get_transcript_text(client: Supadata, requests_mock) -> None:
    """With text=True the transcript content comes back as one string."""
    expected_text = "Hello, this is a test transcript"
    payload = {
        "content": expected_text,
        "lang": "en",
        "availableLangs": ["en", "es"],
    }
    endpoint = f"{client.base_url}/youtube/transcript"
    requests_mock.get(endpoint, json=payload)

    result = client.get_transcript(video_id="test123", text=True)

    assert isinstance(result, Transcript)
    assert isinstance(result.content, str)
    assert result.content == expected_text
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def test_translate_transcript(client: Supadata, requests_mock) -> None:
    """A translated plain-text transcript is returned as TranslatedTranscript."""
    translated_text = "Hola, esto es una prueba"
    payload = {"content": translated_text, "lang": "es"}
    endpoint = f"{client.base_url}/youtube/transcript/translate"
    requests_mock.get(endpoint, json=payload)

    call_args = {"video_id": "test123", "lang": "es", "text": True}
    result = client.translate_transcript(**call_args)

    assert isinstance(result, TranslatedTranscript)
    assert result.content == translated_text
    assert result.lang == "es"
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def test_scrape(client: Supadata, requests_mock) -> None:
    """A scrape response is parsed into a Scrape with snake_case fields."""
    target = "https://test.com"
    payload = {
        "url": target,
        "content": "# Test\nThis is a test page",
        "name": "Test Page",
        "description": "A test page",
        "ogUrl": "https://test.com/og.png",
        "countCharacters": 100,
        "urls": ["https://test.com/about"],
    }
    endpoint = f"{client.base_url}/web/scrape"
    requests_mock.get(endpoint, json=payload)

    page = client.scrape(url=target)

    assert isinstance(page, Scrape)
    assert page.url == target
    assert page.name == "Test Page"
    assert page.og_url == "https://test.com/og.png"
    assert page.count_characters == 100
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def test_map(client: Supadata, requests_mock) -> None:
    """A map response is parsed into a Map holding every returned URL."""
    target = "https://test.com"
    discovered = ["https://test.com", "https://test.com/about"]
    endpoint = f"{client.base_url}/web/map"
    requests_mock.get(endpoint, json={"urls": discovered})

    result = client.map(url=target)

    assert isinstance(result, Map)
    assert len(result.urls) == 2
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def test_error_handling(client: Supadata, requests_mock) -> None:
    """A 404 with an error body raises HTTPError carrying a parsed Error."""
    body = {
        "code": "video-not-found",
        "title": "Video Not Found",
        "description": "The specified video was not found",
        "documentationUrl": "https://docs.test.com/errors#video-not-found",
    }
    endpoint = f"{client.base_url}/youtube/transcript"
    requests_mock.get(endpoint, status_code=404, json=body)

    with pytest.raises(requests.exceptions.HTTPError) as exc_info:
        client.get_transcript(video_id="invalid")

    # The parsed Error is the first positional argument of the exception.
    parsed = exc_info.value.args[0]
    assert isinstance(parsed, Error)
    assert parsed.code == body["code"]
    assert parsed.title == body["title"]
    assert parsed.description == body["description"]
    assert parsed.documentation_url == body["documentationUrl"]
|