instagram-archiver 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of instagram-archiver might be problematic. Click here for more details.

@@ -1,3 +1,9 @@
1
- from .main import main
1
+ """Instagram archiver."""
2
+ from __future__ import annotations
2
3
 
3
- __all__ = ('main',)
4
+ from .client import InstagramClient
5
+ from .profile_scraper import ProfileScraper
6
+ from .saved_scraper import SavedScraper
7
+
8
+ __all__ = ('InstagramClient', 'ProfileScraper', 'SavedScraper')
9
+ __version__ = 'v0.3.0'
@@ -0,0 +1,6 @@
1
+ """Entry point for ``python -m`` invocation."""
2
+ from __future__ import annotations
3
+
4
+ from .main import main
5
+
6
+ main()
@@ -1,168 +1,219 @@
1
- from inspect import Traceback
2
- from os import makedirs, utime
1
+ """Generic client."""
2
+ from __future__ import annotations
3
+
4
+ from http import HTTPStatus
5
+ from os import utime
3
6
  from pathlib import Path
4
- from pprint import pprint as pp
5
- from typing import Collection, Literal, Mapping, Type, TypeVar, overload
6
- from urllib.parse import urlparse
7
+ from typing import TYPE_CHECKING, Any, Self, TypeVar, cast
7
8
  import json
8
- import re
9
- import sqlite3
9
+ import logging
10
10
 
11
- from loguru import logger
12
- from ratelimit import limits, sleep_and_retry
13
- from requests.adapters import HTTPAdapter
14
- from urllib3.util.retry import Retry
15
- from yt_dlp.cookies import extract_cookies_from_browser
11
+ from bs4 import BeautifulSoup as Soup
12
+ from requests import HTTPError
13
+ from yt_dlp_utils import setup_session
16
14
  import requests
17
- import yt_dlp
18
15
 
19
- from .constants import LOG_SCHEMA, RETRY_ABORT_NUM, SHARED_HEADERS, SHARED_YT_DLP_OPTIONS
20
- from .ig_typing import (CarouselMedia, Comments, Edge, HighlightsTray, MediaInfo, MediaInfoItem,
21
- MediaInfoItemImageVersions2Candidate, WebProfileInfo)
22
- from .utils import chdir, get_extension, json_dumps_formatted, write_if_new
16
+ from .constants import API_HEADERS, PAGE_FETCH_HEADERS, SHARED_HEADERS
17
+ from .typing import (
18
+ CarouselMedia,
19
+ Comments,
20
+ Edge,
21
+ HighlightsTray,
22
+ MediaInfo,
23
+ MediaInfoItem,
24
+ MediaInfoItemImageVersions2Candidate,
25
+ )
26
+ from .utils import get_extension, json_dumps_formatted, write_if_new
23
27
 
24
- __all__ = ('InstagramClient',)
28
+ if TYPE_CHECKING:
29
+ from collections.abc import Iterable, Mapping
30
+ from types import TracebackType
25
31
 
26
- Browser = Literal['brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'firefox', 'safari']
27
- T = TypeVar('T')
32
+ from .typing import BrowserName
28
33
 
34
+ __all__ = ('CSRFTokenNotFound', 'InstagramClient')
29
35
 
30
- def _clean_url(url: str) -> str:
31
- parsed = urlparse(url)
32
- return f'https://{parsed.netloc}{parsed.path}'
36
+ T = TypeVar('T')
37
+ log = logging.getLogger(__name__)
33
38
 
34
39
 
35
- class AuthenticationError(Exception):
36
- pass
40
+ class CSRFTokenNotFound(RuntimeError):
41
+ """CSRF token not found in cookies."""
37
42
 
38
43
 
39
44
  class InstagramClient:
40
- def __init__(self,
41
- *,
42
- username: str,
43
- log_file: str | Path | None = None,
44
- output_dir: str | None = None,
45
- disable_log: bool = False,
46
- browser: Browser = 'chrome',
47
- browser_profile: str = 'Default',
48
- debug: bool = False,
49
- comments: bool = False) -> None:
50
- self._no_log = disable_log
51
- self._session = requests.Session()
52
- self._browser = browser
53
- self._browser_profile = browser_profile
54
- self._setup_session(browser, browser_profile)
55
- self._output_dir = Path(output_dir or Path('.').resolve() / username)
56
- makedirs(self._output_dir, exist_ok=True)
57
- self._log_db = Path(log_file or self._output_dir / '.log.db')
58
- self._connection = sqlite3.connect(self._log_db)
59
- self._cursor = self._connection.cursor()
60
- self._setup_db()
61
- self._username = username
62
- self._video_urls: list[str] = []
63
- self._debug = debug
64
- self._get_comments = comments
45
+ """Generic client for Instagram."""
46
+ def __init__(self, browser: BrowserName = 'chrome', browser_profile: str = 'Default') -> None:
47
+ """
48
+ Initialise the client.
65
49
 
66
- def _add_video_url(self, url: str) -> None:
67
- logger.debug(f'Added video URL: {url}')
68
- self._video_urls.append(url)
50
+ Parameters
51
+ ----------
52
+ browser : str
53
+ The browser to use.
69
54
 
70
- def _setup_db(self) -> None:
71
- existed = self._log_db.exists()
72
- if not existed or (existed and self._log_db.stat().st_size == 0):
73
- logger.debug('Creating schema')
74
- self._cursor.execute(LOG_SCHEMA)
55
+ browser_profile : str
56
+ The browser profile to use.
57
+ """
58
+ self.session = setup_session(browser,
59
+ browser_profile,
60
+ SHARED_HEADERS,
61
+ domains={'instagram.com'},
62
+ setup_retry=True,
63
+ status_forcelist=(413, 429, 500, 502, 503, 504))
64
+ self.failed_urls: set[str] = set()
65
+ """Set of failed URLs."""
66
+ self.video_urls: list[str] = []
67
+ """List of video URLs to download."""
75
68
 
76
- def _setup_session(self,
77
- browser: Literal['brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi',
78
- 'firefox', 'safari'] = 'chrome',
79
- browser_profile: str = 'Default') -> None:
80
- self._session.mount(
81
- 'https://',
82
- HTTPAdapter(max_retries=Retry(
83
- backoff_factor=1.5, # wait times are normally 1 and 3 seconds
84
- redirect=0,
85
- status=0,
86
- respect_retry_after_header=False,
87
- status_forcelist=frozenset((413, 429, 500, 502, 503, 504)),
88
- total=RETRY_ABORT_NUM)))
89
- self._session.headers.update({
90
- **SHARED_HEADERS,
91
- **dict(cookie='; '.join(f'{cookie.name}={cookie.value}' \
92
- for cookie in extract_cookies_from_browser(browser, browser_profile)
93
- if 'instagram.com' in cookie.domain))
94
- })
95
- r = self._get_rate_limited('https://www.instagram.com', return_json=False)
96
- m = re.search(r'"config":{"csrf_token":"([^"]+)"', r.text)
97
- assert m is not None
98
- self._session.headers.update({'x-csrftoken': m.group(1)})
69
+ def add_video_url(self, url: str) -> None:
70
+ """Add a video URL to the list of video URLs."""
71
+ log.info('Added video URL: %s', url)
72
+ self.video_urls.append(url)
99
73
 
100
- def _save_to_log(self, url: str) -> None:
101
- if self._no_log:
102
- return
103
- self._cursor.execute('INSERT INTO log (url) VALUES (?)', (_clean_url(url),))
104
- self._connection.commit()
74
+ def add_csrf_token_header(self) -> None:
75
+ """
76
+ Add CSRF token header to the session.
77
+
78
+ Raises
79
+ ------
80
+ CSRFTokenNotFound
81
+ If the CSRF token is not found in the cookies.
82
+ """
83
+ token = self.session.cookies.get('csrftoken')
84
+ if not token:
85
+ raise CSRFTokenNotFound
86
+ self.session.headers.update({'x-csrftoken': token})
87
+
88
+ def graphql_query(self,
89
+ variables: Mapping[str, Any],
90
+ *,
91
+ cast_to: type[T],
92
+ doc_id: str = '9806959572732215') -> T | None:
93
+ """Make a GraphQL query."""
94
+ with self.session.post('https://www.instagram.com/graphql/query',
95
+ headers={
96
+ 'content-type': 'application/x-www-form-urlencoded',
97
+ } | API_HEADERS,
98
+ data={
99
+ 'doc_id': doc_id,
100
+ 'variables': json.dumps(variables, separators=(',', ':'))
101
+ }) as r:
102
+ if r.status_code != HTTPStatus.OK:
103
+ return None
104
+ data = r.json()
105
+ assert isinstance(data, dict)
106
+ if (status := data.get('status')) != 'ok':
107
+ log.error('GraphQL status not "ok": %s', status)
108
+ return None
109
+ if data.get('errors'):
110
+ log.warning('Response has errors.')
111
+ log.debug('Response: %s', json.dumps(data, indent=2))
112
+ if not data.get('data'):
113
+ log.error('No data in response.')
114
+ return cast('T', data['data'])
115
+
116
+ def get_text(self, url: str, *, params: Mapping[str, str] | None = None) -> str:
117
+ """Get text from a URL."""
118
+ with self.session.get(url, params=params, headers=API_HEADERS) as r:
119
+ r.raise_for_status()
120
+ return r.text
121
+
122
+ def highlights_tray(self, user_id: int | str) -> HighlightsTray:
123
+ """Get the highlights tray data for a user."""
124
+ return self.get_json(
125
+ f'https://i.instagram.com/api/v1/highlights/{user_id}/highlights_tray/',
126
+ cast_to=HighlightsTray)
127
+
128
+ def __enter__(self) -> Self: # pragma: no cover
129
+ """Recommended way to initialise the client."""
130
+ return self
131
+
132
+ def __exit__(self, _: type[BaseException] | None, __: BaseException | None,
133
+ ___: TracebackType | None) -> None:
134
+ """Clean up."""
135
+
136
+ def is_saved(self, url: str) -> bool: # pragma: no cover
137
+ """Check if a URL is already saved."""
138
+ return False
105
139
 
106
- def _is_saved(self, url: str) -> bool:
107
- if self._no_log:
108
- return False
109
- self._cursor.execute('SELECT COUNT(url) FROM log WHERE url = ?', (_clean_url(url),))
110
- count: int
111
- count, = self._cursor.fetchone()
112
- return count == 1
140
+ def save_to_log(self, url: str) -> None:
141
+ """Save a URL to the log."""
113
142
 
114
- def _save_image_versions2(self, sub_item: CarouselMedia | MediaInfoItem,
115
- timestamp: int) -> None:
143
+ def save_image_versions2(self, sub_item: CarouselMedia | MediaInfoItem, timestamp: int) -> None:
144
+ """Save images in the image_versions2 dictionary."""
116
145
  def key(x: MediaInfoItemImageVersions2Candidate) -> int:
117
146
  return x['width'] * x['height']
118
147
 
119
- best = sorted(sub_item['image_versions2']['candidates'], key=key, reverse=True)[0]
120
- if self._is_saved(best['url']):
148
+ best = max(sub_item['image_versions2']['candidates'], key=key)
149
+ if self.is_saved(best['url']):
150
+ return
151
+ r = self.session.head(best['url'])
152
+ if r.status_code != HTTPStatus.OK:
153
+ log.warning('HEAD request failed with status code %s.', r.status_code)
121
154
  return
122
- r = self._session.head(best['url'])
123
- r.raise_for_status()
124
155
  ext = get_extension(r.headers['content-type'])
125
156
  name = f'{sub_item["id"]}.{ext}'
126
- with open(name, 'wb') as f:
127
- for content in (self._session.get(best['url'],
128
- stream=True).iter_content(chunk_size=512)):
129
- f.write(content)
157
+ with Path(name).open('wb') as f:
158
+ f.writelines(self.session.get(best['url'], stream=True).iter_content(chunk_size=512))
130
159
  utime(name, (timestamp, timestamp))
131
- self._save_to_log(r.url)
160
+ self.save_to_log(r.url)
132
161
 
133
- def _save_comments(self, edge: Edge) -> None:
134
- if self._get_comments:
135
- comment_url = ('https://www.instagram.com/api/v1/media/'
136
- f'{edge["node"]["id"]}/comments/')
137
- shared_params = dict(can_support_threading='true')
138
- top_comment_data = comment_data = self._get_rate_limited(
139
- comment_url,
140
- params={
141
- **shared_params, 'permalink_enabled': 'false'
142
- },
143
- cast_to=Comments)
144
- while comment_data['can_view_more_preview_comments'] and comment_data['next_min_id']:
145
- comment_data = self._get_rate_limited(comment_url,
146
- params={
147
- **shared_params,
148
- 'min_id':
149
- comment_data['next_min_id'],
150
- },
151
- cast_to=Comments)
152
- top_comment_data['comments'].extend(comment_data['comments'])
153
- comments_json = f'{edge["node"]["id"]}-comments.json'
154
- with open(comments_json, 'w+') as f:
155
- json.dump(top_comment_data, f, sort_keys=True, indent=2)
162
+ def save_comments(self, edge: Edge) -> None:
163
+ """Save comments for an edge node."""
164
+ comment_url = ('https://www.instagram.com/api/v1/media/'
165
+ f'{edge["node"]["id"]}/comments/')
166
+ shared_params = {'can_support_threading': 'true'}
167
+ try:
168
+ comment_data = self.get_json(comment_url,
169
+ params={
170
+ **shared_params, 'permalink_enabled': 'false'
171
+ },
172
+ cast_to=Comments)
173
+ except HTTPError:
174
+ log.exception('Failed to get comments.')
175
+ return
176
+ top_comment_data: Any = comment_data
177
+ while comment_data['can_view_more_preview_comments'] and comment_data['next_min_id']:
178
+ try:
179
+ comment_data = self.get_json(comment_url,
180
+ params={
181
+ **shared_params,
182
+ 'min_id':
183
+ comment_data['next_min_id'],
184
+ },
185
+ cast_to=Comments)
186
+ except HTTPError:
187
+ log.exception('Failed to get comments.')
188
+ break
189
+ top_comment_data['comments'] = (list(top_comment_data['comments']) +
190
+ list(comment_data['comments']))
191
+ comments_json = f'{edge["node"]["id"]}-comments.json'
192
+ with Path(comments_json).open('w+', encoding='utf-8') as f:
193
+ json.dump(top_comment_data, f, sort_keys=True, indent=2)
156
194
 
157
- def _save_media(self, edge: Edge) -> None:
158
- media_info_url = ('https://i.instagram.com/api/v1/media/'
159
- f'{edge["node"]["id"]}/info/')
160
- if self._is_saved(media_info_url):
195
+ def save_media(self, edge: Edge) -> None:
196
+ """Save media for an edge node."""
197
+ log.info('Saving media at URL: https://www.instagram.com/p/%s', edge['node']['code'])
198
+ media_info_url = f'https://www.instagram.com/p/{edge["node"]["code"]}/'
199
+ if self.is_saved(media_info_url):
200
+ return
201
+ r = self.session.get(media_info_url, headers=PAGE_FETCH_HEADERS)
202
+ if r.status_code != HTTPStatus.OK:
203
+ log.warning('GET request failed with status code %s.', r.status_code)
204
+ return
205
+ if 'image_versions2' not in r.text or 'taken_at' not in r.text:
206
+ log.warning('Invalid response. image_versions2 dict not found.')
161
207
  return
162
- media_info = self._get_rate_limited(media_info_url, cast_to=MediaInfo)
163
- if media_info['more_available'] or media_info['num_results'] != 1:
164
- pp(media_info)
165
- raise ValueError('Unhandled more_available set to True')
208
+ soup = Soup(r.text, 'html5lib')
209
+ media_info_embedded = next(
210
+ json.loads(s) for s in (''.join(
211
+ getattr(c, 'text', '') for c in getattr(script, 'contents', ''))
212
+ for script in soup.select('script[type="application/json"]'))
213
+ if 'image_versions2' in s and 'taken_at' in s)
214
+ media_info: MediaInfo = (
215
+ media_info_embedded['require'][0][3][0]['__bbox']['require'][0][3][1]['__bbox']
216
+ ['result']['data']['xdt_api__v1__media__shortcode__web_info'])
166
217
  timestamp = media_info['items'][0]['taken_at']
167
218
  id_json_file = f'{edge["node"]["id"]}.json'
168
219
  media_info_json_file = f'{edge["node"]["id"]}-media-info-0000.json'
@@ -170,140 +221,48 @@ class InstagramClient:
170
221
  write_if_new(media_info_json_file, str(json_dumps_formatted(media_info)))
171
222
  for file in (id_json_file, media_info_json_file):
172
223
  utime(file, (timestamp, timestamp))
173
- self._save_to_log(media_info_url)
224
+ self.save_to_log(media_info_url)
174
225
  for item in media_info['items']:
175
226
  timestamp = item['taken_at']
176
- if 'carousel_media' in item:
177
- for sub_item in item['carousel_media']:
178
- self._save_image_versions2(sub_item, timestamp)
227
+ if (carousel_media := item.get('carousel_media')):
228
+ for sub_item in carousel_media:
229
+ self.save_image_versions2(sub_item, timestamp)
179
230
  elif 'image_versions2' in item:
180
- self._save_image_versions2(item, timestamp)
231
+ self.save_image_versions2(item, timestamp)
181
232
 
182
- def _save_stuff(self, edges: Collection[Edge], parent_edge: Edge | None = None) -> None:
233
+ def save_edges(self, edges: Iterable[Edge], parent_edge: Edge | None = None) -> None:
234
+ """Save edge node media."""
183
235
  for edge in edges:
184
- if edge['node']['__typename'] == 'GraphVideo':
236
+ if edge['node']['__typename'] == 'XDTMediaDict':
185
237
  try:
186
- shortcode = edge['node']['shortcode']
187
- except KeyError as e:
238
+ shortcode = edge['node']['code']
239
+ except KeyError:
188
240
  if parent_edge:
189
241
  try:
190
- shortcode = parent_edge['node']['shortcode']
191
- except KeyError as exc:
192
- raise ValueError('Unknown shortcode') from exc
242
+ shortcode = parent_edge['node']['code']
243
+ except KeyError:
244
+ log.exception('Unknown shortcode.')
245
+ return
193
246
  else:
194
- raise ValueError('Unknown shortcode') from e
195
- self._add_video_url(f'https://www.instagram.com/p/{shortcode}')
196
- elif edge['node']['__typename'] == 'GraphImage':
197
- self._save_media(edge)
198
- elif edge['node']['__typename'] == 'GraphSidecar':
199
- logger.debug('Recursion into child edges')
200
- if (not edge['node']['comments_disabled']
201
- and edge['node']['edge_media_to_comment']['count']):
202
- self._save_comments(edge)
203
- self._save_stuff(edge['node']['edge_sidecar_to_children']['edges'], edge)
247
+ log.exception('Unknown shortcode.')
248
+ if edge['node'].get('video_dash_manifest'):
249
+ self.add_video_url(f'https://www.instagram.com/p/{shortcode}')
250
+ else:
251
+ try:
252
+ self.save_comments(edge)
253
+ self.save_media(edge)
254
+ except requests.exceptions.RetryError:
255
+ log.exception('Retries exhausted.')
256
+ return
204
257
  else:
205
- raise ValueError(f'Unknown type "{edge["node"]["__typename"]}"')
206
-
207
- @overload
208
- def _get_rate_limited(self, url: str, *, cast_to: Type[T]) -> T:
209
- pass
210
-
211
- @overload
212
- def _get_rate_limited(self,
213
- url: str,
214
- *,
215
- return_json: Literal[False] = False) -> requests.Response:
216
- pass
217
-
218
- @overload
219
- def _get_rate_limited(self,
220
- url: str,
221
- *,
222
- params: Mapping[str, str] | None = None,
223
- cast_to: Type[T]) -> T:
224
- pass
258
+ log.warning( # type: ignore[unreachable]
259
+ 'Unknown type: `%s`. Item %s will not be processed.',
260
+ edge['node']['__typename'], edge['node']['id'])
261
+ shortcode = edge['node']['code']
262
+ self.failed_urls.add(f'https://www.instagram.com/p/{shortcode}')
225
263
 
226
- @sleep_and_retry
227
- @limits(calls=10, period=60)
228
- def _get_rate_limited(
229
- self,
230
- url: str,
231
- *,
232
- return_json: bool = True,
233
- params: Mapping[str, str] | None = None,
234
- cast_to: Type[T] | None = None) -> T | requests.Response: # pylint: disable=unused-argument
235
- with self._session.get(url, params=params) as r:
264
+ def get_json(self, url: str, *, cast_to: type[T], params: Mapping[str, str] | None = None) -> T:
265
+ """Get JSON data from a URL."""
266
+ with self.session.get(url, params=params, headers=API_HEADERS) as r:
236
267
  r.raise_for_status()
237
- return r.json() if return_json else r
238
-
239
- def _highlights_tray(self, user_id: int | str) -> HighlightsTray:
240
- return self._get_rate_limited(
241
- f'https://i.instagram.com/api/v1/highlights/{user_id}/'
242
- 'highlights_tray/',
243
- cast_to=HighlightsTray)
244
-
245
- def __enter__(self) -> 'InstagramClient':
246
- return self
247
-
248
- def __exit__(self, _: Type[BaseException], __: BaseException, ___: Traceback) -> None:
249
- self._cursor.close()
250
- self._connection.close()
251
-
252
- def process(self) -> None:
253
- with chdir(self._output_dir):
254
- self._get_rate_limited(f'https://www.instagram.com/{self._username}/',
255
- return_json=False)
256
- r = self._get_rate_limited('https://i.instagram.com/api/v1/users/web_profile_info/',
257
- params={'username': self._username},
258
- cast_to=WebProfileInfo)
259
- with open('web_profile_info.json', 'w') as f:
260
- json.dump(r, f, indent=2, sort_keys=True)
261
- user_info = r['data']['user']
262
- if not self._is_saved(user_info['profile_pic_url_hd']):
263
- with open('profile_pic.jpg', 'wb') as f:
264
- for chunk in self._session.get(user_info['profile_pic_url_hd'],
265
- stream=True).iter_content(chunk_size=512):
266
- f.write(chunk)
267
- self._save_to_log(user_info['profile_pic_url_hd'])
268
- for item in self._highlights_tray(user_info['id'])['tray']:
269
- self._add_video_url('https://www.instagram.com/stories/highlights/'
270
- f'{item["id"].split(":")[-1]}/')
271
- self._save_stuff(user_info['edge_owner_to_timeline_media']['edges'])
272
- page_info = user_info['edge_owner_to_timeline_media']['page_info']
273
- while page_info['has_next_page']:
274
- params = dict(query_hash='69cba40317214236af40e7efa697781d',
275
- variables=json.dumps(
276
- dict(id=user_info['id'], first=12,
277
- after=page_info['end_cursor'])))
278
- media = self._get_rate_limited(
279
- 'https://www.instagram.com/graphql/query/',
280
- params=params,
281
- cast_to=WebProfileInfo)['data']['user']['edge_owner_to_timeline_media']
282
- page_info = media['page_info']
283
- self._save_stuff(media['edges'])
284
- if len(self._video_urls) > 0:
285
- with yt_dlp.YoutubeDL({
286
- **SHARED_YT_DLP_OPTIONS, # type: ignore[misc]
287
- **{
288
- 'cookiesfrombrowser': [
289
- self._browser, self._browser_profile, None, None
290
- ],
291
- 'getcomments': self._get_comments,
292
- 'verbose': self._debug
293
- }
294
- }) as ydl:
295
- failed_urls: list[str] = []
296
- while (self._video_urls and (url := self._video_urls.pop())):
297
- if self._is_saved(url):
298
- logger.debug(f'{url} is already saved')
299
- continue
300
- if ydl.extract_info(url):
301
- logger.debug(f'Extracting {url}')
302
- self._save_to_log(url)
303
- else:
304
- failed_urls.append(url)
305
- if len(failed_urls) > 0:
306
- logger.error('Some video URIs failed. Check failed.txt.')
307
- with open('failed.txt', 'w') as f:
308
- for url in failed_urls:
309
- f.write(f'{url}\n')
268
+ return cast('T', r.json())