instagram-archiver 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of instagram-archiver might be problematic. Click here for more details.
- instagram_archiver-0.3.0/LICENSE.txt +18 -0
- instagram_archiver-0.3.0/PKG-INFO +119 -0
- instagram_archiver-0.3.0/README.md +89 -0
- instagram_archiver-0.3.0/instagram_archiver/__init__.py +9 -0
- instagram_archiver-0.3.0/instagram_archiver/__main__.py +6 -0
- instagram_archiver-0.3.0/instagram_archiver/client.py +268 -0
- instagram_archiver-0.3.0/instagram_archiver/constants.py +63 -0
- instagram_archiver-0.3.0/instagram_archiver/main.py +98 -0
- instagram_archiver-0.3.0/instagram_archiver/profile_scraper.py +194 -0
- instagram_archiver-0.3.0/instagram_archiver/py.typed +0 -0
- instagram_archiver-0.3.0/instagram_archiver/saved_scraper.py +78 -0
- instagram_archiver-0.3.0/instagram_archiver/typing.py +170 -0
- instagram_archiver-0.3.0/instagram_archiver/utils.py +134 -0
- instagram_archiver-0.3.0/man/instagram-archiver.1 +161 -0
- instagram_archiver-0.3.0/pyproject.toml +343 -0
- instagram_archiver-0.2.0/LICENSE.txt +0 -21
- instagram_archiver-0.2.0/PKG-INFO +0 -37
- instagram_archiver-0.2.0/README.md +0 -17
- instagram_archiver-0.2.0/instagram_archiver/__init__.py +0 -3
- instagram_archiver-0.2.0/instagram_archiver/client.py +0 -309
- instagram_archiver-0.2.0/instagram_archiver/constants.py +0 -103
- instagram_archiver-0.2.0/instagram_archiver/ig_typing.py +0 -117
- instagram_archiver-0.2.0/instagram_archiver/main.py +0 -66
- instagram_archiver-0.2.0/instagram_archiver/utils.py +0 -110
- instagram_archiver-0.2.0/pyproject.toml +0 -45
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 instagram-archiver authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
|
|
6
|
+
associated documentation files (the "Software"), to deal in the Software without restriction,
|
|
7
|
+
including without limitation the rights to use, copy, modify, merge, publish, distribute,
|
|
8
|
+
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
|
|
9
|
+
furnished to do so, subject to the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be included in all copies or
|
|
12
|
+
substantial portions of the Software.
|
|
13
|
+
|
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
|
|
15
|
+
NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
16
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
|
|
17
|
+
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
|
|
18
|
+
OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: instagram-archiver
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Save Instagram content you have access to.
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: command line,instagram
|
|
7
|
+
Author: Andrew Udvare
|
|
8
|
+
Author-email: audvare@gmail.com
|
|
9
|
+
Requires-Python: >=3.12,<3.14
|
|
10
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Typing :: Typed
|
|
17
|
+
Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
|
|
18
|
+
Requires-Dist: click (>=8.2.0,<9.0.0)
|
|
19
|
+
Requires-Dist: colorlog (>=6.9.0,<7.0.0)
|
|
20
|
+
Requires-Dist: html5lib (>=1.1,<2.0)
|
|
21
|
+
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
|
22
|
+
Requires-Dist: typing-extensions (>=4.13.1,<5.0.0)
|
|
23
|
+
Requires-Dist: yt-dlp-utils (>=0,<1)
|
|
24
|
+
Project-URL: Documentation, https://instagram-archiver.readthedocs.org
|
|
25
|
+
Project-URL: Homepage, https://tatsh.github.io/instagram-archiver/
|
|
26
|
+
Project-URL: Issues, https://github.com/Tatsh/instagram-archiver/issues
|
|
27
|
+
Project-URL: Repository, https://github.com/Tatsh/instagram-archiver
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# instagram-archiver
|
|
31
|
+
|
|
32
|
+
[](https://www.python.org/)
|
|
33
|
+
[](https://pypi.org/project/instagram-archiver/)
|
|
34
|
+
[](https://github.com/Tatsh/instagram-archiver/tags)
|
|
35
|
+
[](https://github.com/Tatsh/instagram-archiver/blob/master/LICENSE.txt)
|
|
36
|
+
[](https://github.com/Tatsh/instagram-archiver/compare/v0.3.0...master)
|
|
37
|
+
[](https://github.com/Tatsh/instagram-archiver/actions/workflows/qa.yml)
|
|
38
|
+
[](https://github.com/Tatsh/instagram-archiver/actions/workflows/tests.yml)
|
|
39
|
+
[](https://coveralls.io/github/Tatsh/instagram-archiver?branch=master)
|
|
40
|
+
[](https://instagram-archiver.readthedocs.org/?badge=latest)
|
|
41
|
+
[](http://mypy-lang.org/)
|
|
42
|
+
[](https://github.com/pre-commit/pre-commit)
|
|
43
|
+
[](http://www.pydocstyle.org/en/stable/)
|
|
44
|
+
[](https://docs.pytest.org/en/stable/)
|
|
45
|
+
[](https://github.com/astral-sh/ruff)
|
|
46
|
+
[](https://pepy.tech/project/instagram-archiver)
|
|
47
|
+
[](https://github.com/Tatsh/instagram-archiver/stargazers)
|
|
48
|
+
|
|
49
|
+
[](https://bsky.app/profile/Tatsh.bsky.social)
|
|
50
|
+
[](https://hostux.social/@Tatsh)
|
|
51
|
+
|
|
52
|
+
Save Instagram content you have access to.
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
### Poetry
|
|
57
|
+
|
|
58
|
+
```shell
|
|
59
|
+
poetry add instagram-archiver
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Pip
|
|
63
|
+
|
|
64
|
+
```shell
|
|
65
|
+
pip install instagram-archiver
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Usage
|
|
69
|
+
|
|
70
|
+
```plain
|
|
71
|
+
Usage: instagram-archiver [OPTIONS] USERNAME
|
|
72
|
+
|
|
73
|
+
Archive a profile's posts.
|
|
74
|
+
|
|
75
|
+
Options:
|
|
76
|
+
-o, --output-dir DIRECTORY Output directory.
|
|
77
|
+
-b, --browser [brave|chrome|chromium|edge|opera|vivaldi|firefox|safari]
|
|
78
|
+
Browser to read cookies from.
|
|
79
|
+
-p, --profile TEXT Browser profile.
|
|
80
|
+
-d, --debug Enable debug output.
|
|
81
|
+
--no-log Ignore log (re-fetch everything).
|
|
82
|
+
-C, --include-comments Also download all comments (extends download
|
|
83
|
+
time significantly).
|
|
84
|
+
-h, --help Show this message and exit.
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Typical use:
|
|
88
|
+
|
|
89
|
+
```shell
|
|
90
|
+
instagram-archiver -o ~/instagram-backups/username username
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### `instagram-save-saved`
|
|
94
|
+
|
|
95
|
+
This tool saves your saved posts (at `www.instagram.com/username/saved/all-posts`).
|
|
96
|
+
|
|
97
|
+
```plain
|
|
98
|
+
Usage: instagram-save-saved [OPTIONS]
|
|
99
|
+
|
|
100
|
+
Archive your saved posts.
|
|
101
|
+
|
|
102
|
+
Options:
|
|
103
|
+
-o, --output-dir DIRECTORY Output directory.
|
|
104
|
+
-b, --browser [brave|chrome|chromium|edge|opera|vivaldi|firefox|safari]
|
|
105
|
+
Browser to read cookies from.
|
|
106
|
+
-p, --profile TEXT Browser profile.
|
|
107
|
+
-d, --debug Enable debug output.
|
|
108
|
+
-C, --include-comments Also download all comments (extends download
|
|
109
|
+
time significantly).
|
|
110
|
+
-u, --unsave Unsave posts after successful archive.
|
|
111
|
+
-h, --help Show this message and exit.
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Notes
|
|
115
|
+
|
|
116
|
+
The default output path is the username under the current working directory.
|
|
117
|
+
|
|
118
|
+
Videos are saved using yt-dlp, which applies its own configuration.
|
|
119
|
+
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# instagram-archiver
|
|
2
|
+
|
|
3
|
+
[](https://www.python.org/)
|
|
4
|
+
[](https://pypi.org/project/instagram-archiver/)
|
|
5
|
+
[](https://github.com/Tatsh/instagram-archiver/tags)
|
|
6
|
+
[](https://github.com/Tatsh/instagram-archiver/blob/master/LICENSE.txt)
|
|
7
|
+
[](https://github.com/Tatsh/instagram-archiver/compare/v0.3.0...master)
|
|
8
|
+
[](https://github.com/Tatsh/instagram-archiver/actions/workflows/qa.yml)
|
|
9
|
+
[](https://github.com/Tatsh/instagram-archiver/actions/workflows/tests.yml)
|
|
10
|
+
[](https://coveralls.io/github/Tatsh/instagram-archiver?branch=master)
|
|
11
|
+
[](https://instagram-archiver.readthedocs.org/?badge=latest)
|
|
12
|
+
[](http://mypy-lang.org/)
|
|
13
|
+
[](https://github.com/pre-commit/pre-commit)
|
|
14
|
+
[](http://www.pydocstyle.org/en/stable/)
|
|
15
|
+
[](https://docs.pytest.org/en/stable/)
|
|
16
|
+
[](https://github.com/astral-sh/ruff)
|
|
17
|
+
[](https://pepy.tech/project/instagram-archiver)
|
|
18
|
+
[](https://github.com/Tatsh/instagram-archiver/stargazers)
|
|
19
|
+
|
|
20
|
+
[](https://bsky.app/profile/Tatsh.bsky.social)
|
|
21
|
+
[](https://hostux.social/@Tatsh)
|
|
22
|
+
|
|
23
|
+
Save Instagram content you have access to.
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
### Poetry
|
|
28
|
+
|
|
29
|
+
```shell
|
|
30
|
+
poetry add instagram-archiver
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### Pip
|
|
34
|
+
|
|
35
|
+
```shell
|
|
36
|
+
pip install instagram-archiver
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Usage
|
|
40
|
+
|
|
41
|
+
```plain
|
|
42
|
+
Usage: instagram-archiver [OPTIONS] USERNAME
|
|
43
|
+
|
|
44
|
+
Archive a profile's posts.
|
|
45
|
+
|
|
46
|
+
Options:
|
|
47
|
+
-o, --output-dir DIRECTORY Output directory.
|
|
48
|
+
-b, --browser [brave|chrome|chromium|edge|opera|vivaldi|firefox|safari]
|
|
49
|
+
Browser to read cookies from.
|
|
50
|
+
-p, --profile TEXT Browser profile.
|
|
51
|
+
-d, --debug Enable debug output.
|
|
52
|
+
--no-log Ignore log (re-fetch everything).
|
|
53
|
+
-C, --include-comments Also download all comments (extends download
|
|
54
|
+
time significantly).
|
|
55
|
+
-h, --help Show this message and exit.
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Typical use:
|
|
59
|
+
|
|
60
|
+
```shell
|
|
61
|
+
instagram-archiver -o ~/instagram-backups/username username
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### `instagram-save-saved`
|
|
65
|
+
|
|
66
|
+
This tool saves your saved posts (at `www.instagram.com/username/saved/all-posts`).
|
|
67
|
+
|
|
68
|
+
```plain
|
|
69
|
+
Usage: instagram-save-saved [OPTIONS]
|
|
70
|
+
|
|
71
|
+
Archive your saved posts.
|
|
72
|
+
|
|
73
|
+
Options:
|
|
74
|
+
-o, --output-dir DIRECTORY Output directory.
|
|
75
|
+
-b, --browser [brave|chrome|chromium|edge|opera|vivaldi|firefox|safari]
|
|
76
|
+
Browser to read cookies from.
|
|
77
|
+
-p, --profile TEXT Browser profile.
|
|
78
|
+
-d, --debug Enable debug output.
|
|
79
|
+
-C, --include-comments Also download all comments (extends download
|
|
80
|
+
time significantly).
|
|
81
|
+
-u, --unsave Unsave posts after successful archive.
|
|
82
|
+
-h, --help Show this message and exit.
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Notes
|
|
86
|
+
|
|
87
|
+
The default output path is the username under the current working directory.
|
|
88
|
+
|
|
89
|
+
Videos are saved using yt-dlp, which applies its own configuration.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Instagram archiver."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from .client import InstagramClient
|
|
5
|
+
from .profile_scraper import ProfileScraper
|
|
6
|
+
from .saved_scraper import SavedScraper
|
|
7
|
+
|
|
8
|
+
__all__ = ('InstagramClient', 'ProfileScraper', 'SavedScraper')
|
|
9
|
+
__version__ = 'v0.3.0'
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
"""Generic client."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from http import HTTPStatus
|
|
5
|
+
from os import utime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Self, TypeVar, cast
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
from bs4 import BeautifulSoup as Soup
|
|
12
|
+
from requests import HTTPError
|
|
13
|
+
from yt_dlp_utils import setup_session
|
|
14
|
+
import requests
|
|
15
|
+
|
|
16
|
+
from .constants import API_HEADERS, PAGE_FETCH_HEADERS, SHARED_HEADERS
|
|
17
|
+
from .typing import (
|
|
18
|
+
CarouselMedia,
|
|
19
|
+
Comments,
|
|
20
|
+
Edge,
|
|
21
|
+
HighlightsTray,
|
|
22
|
+
MediaInfo,
|
|
23
|
+
MediaInfoItem,
|
|
24
|
+
MediaInfoItemImageVersions2Candidate,
|
|
25
|
+
)
|
|
26
|
+
from .utils import get_extension, json_dumps_formatted, write_if_new
|
|
27
|
+
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
from collections.abc import Iterable, Mapping
|
|
30
|
+
from types import TracebackType
|
|
31
|
+
|
|
32
|
+
from .typing import BrowserName
|
|
33
|
+
|
|
34
|
+
__all__ = ('CSRFTokenNotFound', 'InstagramClient')
|
|
35
|
+
|
|
36
|
+
T = TypeVar('T')
|
|
37
|
+
log = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class CSRFTokenNotFound(RuntimeError):
    """Raised when no CSRF token can be found in the session cookies."""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class InstagramClient:
|
|
45
|
+
"""Generic client for Instagram."""
|
|
46
|
+
def __init__(self, browser: BrowserName = 'chrome', browser_profile: str = 'Default') -> None:
|
|
47
|
+
"""
|
|
48
|
+
Initialise the client.
|
|
49
|
+
|
|
50
|
+
Parameters
|
|
51
|
+
----------
|
|
52
|
+
browser : str
|
|
53
|
+
The browser to use.
|
|
54
|
+
|
|
55
|
+
browser_profile : str
|
|
56
|
+
The browser profile to use.
|
|
57
|
+
"""
|
|
58
|
+
self.session = setup_session(browser,
|
|
59
|
+
browser_profile,
|
|
60
|
+
SHARED_HEADERS,
|
|
61
|
+
domains={'instagram.com'},
|
|
62
|
+
setup_retry=True,
|
|
63
|
+
status_forcelist=(413, 429, 500, 502, 503, 504))
|
|
64
|
+
self.failed_urls: set[str] = set()
|
|
65
|
+
"""Set of failed URLs."""
|
|
66
|
+
self.video_urls: list[str] = []
|
|
67
|
+
"""List of video URLs to download."""
|
|
68
|
+
|
|
69
|
+
def add_video_url(self, url: str) -> None:
|
|
70
|
+
"""Add a video URL to the list of video URLs."""
|
|
71
|
+
log.info('Added video URL: %s', url)
|
|
72
|
+
self.video_urls.append(url)
|
|
73
|
+
|
|
74
|
+
def add_csrf_token_header(self) -> None:
|
|
75
|
+
"""
|
|
76
|
+
Add CSRF token header to the session.
|
|
77
|
+
|
|
78
|
+
Raises
|
|
79
|
+
------
|
|
80
|
+
CSRFTokenNotFound
|
|
81
|
+
If the CSRF token is not found in the cookies.
|
|
82
|
+
"""
|
|
83
|
+
token = self.session.cookies.get('csrftoken')
|
|
84
|
+
if not token:
|
|
85
|
+
raise CSRFTokenNotFound
|
|
86
|
+
self.session.headers.update({'x-csrftoken': token})
|
|
87
|
+
|
|
88
|
+
def graphql_query(self,
|
|
89
|
+
variables: Mapping[str, Any],
|
|
90
|
+
*,
|
|
91
|
+
cast_to: type[T],
|
|
92
|
+
doc_id: str = '9806959572732215') -> T | None:
|
|
93
|
+
"""Make a GraphQL query."""
|
|
94
|
+
with self.session.post('https://www.instagram.com/graphql/query',
|
|
95
|
+
headers={
|
|
96
|
+
'content-type': 'application/x-www-form-urlencoded',
|
|
97
|
+
} | API_HEADERS,
|
|
98
|
+
data={
|
|
99
|
+
'doc_id': doc_id,
|
|
100
|
+
'variables': json.dumps(variables, separators=(',', ':'))
|
|
101
|
+
}) as r:
|
|
102
|
+
if r.status_code != HTTPStatus.OK:
|
|
103
|
+
return None
|
|
104
|
+
data = r.json()
|
|
105
|
+
assert isinstance(data, dict)
|
|
106
|
+
if (status := data.get('status')) != 'ok':
|
|
107
|
+
log.error('GraphQL status not "ok": %s', status)
|
|
108
|
+
return None
|
|
109
|
+
if data.get('errors'):
|
|
110
|
+
log.warning('Response has errors.')
|
|
111
|
+
log.debug('Response: %s', json.dumps(data, indent=2))
|
|
112
|
+
if not data.get('data'):
|
|
113
|
+
log.error('No data in response.')
|
|
114
|
+
return cast('T', data['data'])
|
|
115
|
+
|
|
116
|
+
def get_text(self, url: str, *, params: Mapping[str, str] | None = None) -> str:
|
|
117
|
+
"""Get text from a URL."""
|
|
118
|
+
with self.session.get(url, params=params, headers=API_HEADERS) as r:
|
|
119
|
+
r.raise_for_status()
|
|
120
|
+
return r.text
|
|
121
|
+
|
|
122
|
+
def highlights_tray(self, user_id: int | str) -> HighlightsTray:
|
|
123
|
+
"""Get the highlights tray data for a user."""
|
|
124
|
+
return self.get_json(
|
|
125
|
+
f'https://i.instagram.com/api/v1/highlights/{user_id}/highlights_tray/',
|
|
126
|
+
cast_to=HighlightsTray)
|
|
127
|
+
|
|
128
|
+
def __enter__(self) -> Self: # pragma: no cover
|
|
129
|
+
"""Recommended way to initialise the client."""
|
|
130
|
+
return self
|
|
131
|
+
|
|
132
|
+
def __exit__(self, _: type[BaseException] | None, __: BaseException | None,
|
|
133
|
+
___: TracebackType | None) -> None:
|
|
134
|
+
"""Clean up."""
|
|
135
|
+
|
|
136
|
+
def is_saved(self, url: str) -> bool: # pragma: no cover
|
|
137
|
+
"""Check if a URL is already saved."""
|
|
138
|
+
return False
|
|
139
|
+
|
|
140
|
+
def save_to_log(self, url: str) -> None:
|
|
141
|
+
"""Save a URL to the log."""
|
|
142
|
+
|
|
143
|
+
def save_image_versions2(self, sub_item: CarouselMedia | MediaInfoItem, timestamp: int) -> None:
|
|
144
|
+
"""Save images in the image_versions2 dictionary."""
|
|
145
|
+
def key(x: MediaInfoItemImageVersions2Candidate) -> int:
|
|
146
|
+
return x['width'] * x['height']
|
|
147
|
+
|
|
148
|
+
best = max(sub_item['image_versions2']['candidates'], key=key)
|
|
149
|
+
if self.is_saved(best['url']):
|
|
150
|
+
return
|
|
151
|
+
r = self.session.head(best['url'])
|
|
152
|
+
if r.status_code != HTTPStatus.OK:
|
|
153
|
+
log.warning('HEAD request failed with status code %s.', r.status_code)
|
|
154
|
+
return
|
|
155
|
+
ext = get_extension(r.headers['content-type'])
|
|
156
|
+
name = f'{sub_item["id"]}.{ext}'
|
|
157
|
+
with Path(name).open('wb') as f:
|
|
158
|
+
f.writelines(self.session.get(best['url'], stream=True).iter_content(chunk_size=512))
|
|
159
|
+
utime(name, (timestamp, timestamp))
|
|
160
|
+
self.save_to_log(r.url)
|
|
161
|
+
|
|
162
|
+
def save_comments(self, edge: Edge) -> None:
|
|
163
|
+
"""Save comments for an edge node."""
|
|
164
|
+
comment_url = ('https://www.instagram.com/api/v1/media/'
|
|
165
|
+
f'{edge["node"]["id"]}/comments/')
|
|
166
|
+
shared_params = {'can_support_threading': 'true'}
|
|
167
|
+
try:
|
|
168
|
+
comment_data = self.get_json(comment_url,
|
|
169
|
+
params={
|
|
170
|
+
**shared_params, 'permalink_enabled': 'false'
|
|
171
|
+
},
|
|
172
|
+
cast_to=Comments)
|
|
173
|
+
except HTTPError:
|
|
174
|
+
log.exception('Failed to get comments.')
|
|
175
|
+
return
|
|
176
|
+
top_comment_data: Any = comment_data
|
|
177
|
+
while comment_data['can_view_more_preview_comments'] and comment_data['next_min_id']:
|
|
178
|
+
try:
|
|
179
|
+
comment_data = self.get_json(comment_url,
|
|
180
|
+
params={
|
|
181
|
+
**shared_params,
|
|
182
|
+
'min_id':
|
|
183
|
+
comment_data['next_min_id'],
|
|
184
|
+
},
|
|
185
|
+
cast_to=Comments)
|
|
186
|
+
except HTTPError:
|
|
187
|
+
log.exception('Failed to get comments.')
|
|
188
|
+
break
|
|
189
|
+
top_comment_data['comments'] = (list(top_comment_data['comments']) +
|
|
190
|
+
list(comment_data['comments']))
|
|
191
|
+
comments_json = f'{edge["node"]["id"]}-comments.json'
|
|
192
|
+
with Path(comments_json).open('w+', encoding='utf-8') as f:
|
|
193
|
+
json.dump(top_comment_data, f, sort_keys=True, indent=2)
|
|
194
|
+
|
|
195
|
+
def save_media(self, edge: Edge) -> None:
|
|
196
|
+
"""Save media for an edge node."""
|
|
197
|
+
log.info('Saving media at URL: https://www.instagram.com/p/%s', edge['node']['code'])
|
|
198
|
+
media_info_url = f'https://www.instagram.com/p/{edge["node"]["code"]}/'
|
|
199
|
+
if self.is_saved(media_info_url):
|
|
200
|
+
return
|
|
201
|
+
r = self.session.get(media_info_url, headers=PAGE_FETCH_HEADERS)
|
|
202
|
+
if r.status_code != HTTPStatus.OK:
|
|
203
|
+
log.warning('GET request failed with status code %s.', r.status_code)
|
|
204
|
+
return
|
|
205
|
+
if 'image_versions2' not in r.text or 'taken_at' not in r.text:
|
|
206
|
+
log.warning('Invalid response. image_versions2 dict not found.')
|
|
207
|
+
return
|
|
208
|
+
soup = Soup(r.text, 'html5lib')
|
|
209
|
+
media_info_embedded = next(
|
|
210
|
+
json.loads(s) for s in (''.join(
|
|
211
|
+
getattr(c, 'text', '') for c in getattr(script, 'contents', ''))
|
|
212
|
+
for script in soup.select('script[type="application/json"]'))
|
|
213
|
+
if 'image_versions2' in s and 'taken_at' in s)
|
|
214
|
+
media_info: MediaInfo = (
|
|
215
|
+
media_info_embedded['require'][0][3][0]['__bbox']['require'][0][3][1]['__bbox']
|
|
216
|
+
['result']['data']['xdt_api__v1__media__shortcode__web_info'])
|
|
217
|
+
timestamp = media_info['items'][0]['taken_at']
|
|
218
|
+
id_json_file = f'{edge["node"]["id"]}.json'
|
|
219
|
+
media_info_json_file = f'{edge["node"]["id"]}-media-info-0000.json'
|
|
220
|
+
write_if_new(id_json_file, str(json_dumps_formatted(edge['node'])))
|
|
221
|
+
write_if_new(media_info_json_file, str(json_dumps_formatted(media_info)))
|
|
222
|
+
for file in (id_json_file, media_info_json_file):
|
|
223
|
+
utime(file, (timestamp, timestamp))
|
|
224
|
+
self.save_to_log(media_info_url)
|
|
225
|
+
for item in media_info['items']:
|
|
226
|
+
timestamp = item['taken_at']
|
|
227
|
+
if (carousel_media := item.get('carousel_media')):
|
|
228
|
+
for sub_item in carousel_media:
|
|
229
|
+
self.save_image_versions2(sub_item, timestamp)
|
|
230
|
+
elif 'image_versions2' in item:
|
|
231
|
+
self.save_image_versions2(item, timestamp)
|
|
232
|
+
|
|
233
|
+
def save_edges(self, edges: Iterable[Edge], parent_edge: Edge | None = None) -> None:
|
|
234
|
+
"""Save edge node media."""
|
|
235
|
+
for edge in edges:
|
|
236
|
+
if edge['node']['__typename'] == 'XDTMediaDict':
|
|
237
|
+
try:
|
|
238
|
+
shortcode = edge['node']['code']
|
|
239
|
+
except KeyError:
|
|
240
|
+
if parent_edge:
|
|
241
|
+
try:
|
|
242
|
+
shortcode = parent_edge['node']['code']
|
|
243
|
+
except KeyError:
|
|
244
|
+
log.exception('Unknown shortcode.')
|
|
245
|
+
return
|
|
246
|
+
else:
|
|
247
|
+
log.exception('Unknown shortcode.')
|
|
248
|
+
if edge['node'].get('video_dash_manifest'):
|
|
249
|
+
self.add_video_url(f'https://www.instagram.com/p/{shortcode}')
|
|
250
|
+
else:
|
|
251
|
+
try:
|
|
252
|
+
self.save_comments(edge)
|
|
253
|
+
self.save_media(edge)
|
|
254
|
+
except requests.exceptions.RetryError:
|
|
255
|
+
log.exception('Retries exhausted.')
|
|
256
|
+
return
|
|
257
|
+
else:
|
|
258
|
+
log.warning( # type: ignore[unreachable]
|
|
259
|
+
'Unknown type: `%s`. Item %s will not be processed.',
|
|
260
|
+
edge['node']['__typename'], edge['node']['id'])
|
|
261
|
+
shortcode = edge['node']['code']
|
|
262
|
+
self.failed_urls.add(f'https://www.instagram.com/p/{shortcode}')
|
|
263
|
+
|
|
264
|
+
def get_json(self, url: str, *, cast_to: type[T], params: Mapping[str, str] | None = None) -> T:
|
|
265
|
+
"""Get JSON data from a URL."""
|
|
266
|
+
with self.session.get(url, params=params, headers=API_HEADERS) as r:
|
|
267
|
+
r.raise_for_status()
|
|
268
|
+
return cast('T', r.json())
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Constants."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
__all__ = ('API_HEADERS', 'BROWSER_CHOICES', 'PAGE_FETCH_HEADERS', 'SHARED_HEADERS', 'USER_AGENT')
|
|
5
|
+
|
|
6
|
+
USER_AGENT = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
|
|
7
|
+
'Chrome/137.0.0.0 Safari/537.36')
|
|
8
|
+
"""
|
|
9
|
+
User agent.
|
|
10
|
+
|
|
11
|
+
:meta hide-value:
|
|
12
|
+
"""
|
|
13
|
+
SHARED_HEADERS = {
|
|
14
|
+
'accept': '*/*',
|
|
15
|
+
'authority': 'www.instagram.com',
|
|
16
|
+
'cache-control': 'no-cache',
|
|
17
|
+
'dnt': '1',
|
|
18
|
+
'pragma': 'no-cache',
|
|
19
|
+
'user-agent': USER_AGENT,
|
|
20
|
+
# 'x-asbd-id': '359341',
|
|
21
|
+
# 'x-ig-app-id': '936619743392459',
|
|
22
|
+
}
|
|
23
|
+
"""
|
|
24
|
+
Headers to use for requests.
|
|
25
|
+
|
|
26
|
+
:meta hide-value:
|
|
27
|
+
"""
|
|
28
|
+
API_HEADERS = {
|
|
29
|
+
'x-asbd-id': '359341',
|
|
30
|
+
'x-ig-app-id': '936619743392459',
|
|
31
|
+
}
|
|
32
|
+
"""
|
|
33
|
+
Headers to use for API requests.
|
|
34
|
+
|
|
35
|
+
:meta hide-value:
|
|
36
|
+
"""
|
|
37
|
+
PAGE_FETCH_HEADERS = {
|
|
38
|
+
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,'
|
|
39
|
+
'image/apng,*/*;q=0.8',
|
|
40
|
+
'dpr': '1.5',
|
|
41
|
+
'sec-fetch-mode': 'navigate', # Definitely required.
|
|
42
|
+
'viewport-width': '3840',
|
|
43
|
+
}
|
|
44
|
+
"""
|
|
45
|
+
Headers to use for fetching HTML pages.
|
|
46
|
+
|
|
47
|
+
:meta hide-value:
|
|
48
|
+
"""
|
|
49
|
+
LOG_SCHEMA = """CREATE TABLE log (
|
|
50
|
+
url TEXT PRIMARY KEY NOT NULL,
|
|
51
|
+
date TEXT DEFAULT CURRENT_TIMESTAMP NOT NULL
|
|
52
|
+
);"""
|
|
53
|
+
"""
|
|
54
|
+
Schema for log database.
|
|
55
|
+
|
|
56
|
+
:meta hide-value:
|
|
57
|
+
"""
|
|
58
|
+
BROWSER_CHOICES = ('brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'firefox', 'safari')
|
|
59
|
+
"""
|
|
60
|
+
Possible browser choices to get cookies from.
|
|
61
|
+
|
|
62
|
+
:meta hide-value:
|
|
63
|
+
"""
|