instagram-archiver 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of instagram-archiver might be problematic; consult the registry's advisory page for this release for more details.

@@ -1,3 +1,9 @@
1
- from .main import main as instagram_archiver
1
+ """Instagram archiver."""
2
+ from __future__ import annotations
2
3
 
3
- __all__ = ('instagram_archiver',)
4
+ from .client import InstagramClient
5
+ from .profile_scraper import ProfileScraper
6
+ from .saved_scraper import SavedScraper
7
+
8
+ __all__ = ('InstagramClient', 'ProfileScraper', 'SavedScraper')
9
+ __version__ = 'v0.3.1'
@@ -1,3 +1,6 @@
1
+ """Entry point for ``python -m`` invocation."""
2
+ from __future__ import annotations
3
+
1
4
  from .main import main
2
5
 
3
- main() # pylint: disable=no-value-for-parameter
6
+ main()
@@ -1,168 +1,223 @@
1
- from copy import deepcopy
2
- from inspect import Traceback
3
- from os import makedirs, utime
1
+ """Generic client."""
2
+ from __future__ import annotations
3
+
4
+ from http import HTTPStatus
5
+ from os import utime
4
6
  from pathlib import Path
5
- from pprint import pprint as pp
6
- from typing import Collection, Literal, Mapping, Type, TypeVar, overload
7
- from urllib.parse import urlparse
7
+ from typing import TYPE_CHECKING, Any, Self, TypeVar, cast
8
8
  import json
9
- import re
10
- import sqlite3
11
-
12
- from loguru import logger
13
- from ratelimit import limits, sleep_and_retry
14
- from requests.adapters import HTTPAdapter
15
- from urllib3.util.retry import Retry
16
- from yt_dlp.cookies import extract_cookies_from_browser
9
+ import logging
10
+
11
+ from requests import HTTPError
12
+ from yt_dlp_utils import setup_session
17
13
  import requests
18
- import yt_dlp
19
14
 
20
- from .constants import LOG_SCHEMA, RETRY_ABORT_NUM, SHARED_HEADERS, SHARED_YT_DLP_OPTIONS
21
- from .ig_typing import (BrowserName, CarouselMedia, Comments, Edge, HighlightsTray, MediaInfo,
22
- MediaInfoItem, MediaInfoItemImageVersions2Candidate, WebProfileInfo)
23
- from .utils import chdir, get_extension, json_dumps_formatted, write_if_new
15
+ from .constants import API_HEADERS, SHARED_HEADERS
16
+ from .typing import (
17
+ CarouselMedia,
18
+ Comments,
19
+ Edge,
20
+ HighlightsTray,
21
+ MediaInfo,
22
+ MediaInfoItem,
23
+ MediaInfoItemImageVersions2Candidate,
24
+ )
25
+ from .utils import get_extension, json_dumps_formatted, write_if_new
26
+
27
+ if TYPE_CHECKING:
28
+ from collections.abc import Iterable, Mapping
29
+ from types import TracebackType
24
30
 
25
- __all__ = ('InstagramClient',)
31
+ from .typing import BrowserName
32
+
33
+ __all__ = ('CSRFTokenNotFound', 'InstagramClient', 'UnexpectedRedirect')
26
34
 
27
35
  T = TypeVar('T')
36
+ log = logging.getLogger(__name__)
28
37
 
29
38
 
30
- def _clean_url(url: str) -> str:
31
- parsed = urlparse(url)
32
- return f'https://{parsed.netloc}{parsed.path}'
39
+ class CSRFTokenNotFound(RuntimeError):
40
+ """CSRF token not found in cookies."""
33
41
 
34
42
 
35
- class AuthenticationError(Exception):
36
- pass
43
+ class UnexpectedRedirect(RuntimeError):
44
+ """Unexpected redirect in a request."""
37
45
 
38
46
 
39
47
  class InstagramClient:
40
- """The client."""
41
- def __init__(self,
42
- *,
43
- username: str,
44
- log_file: str | Path | None = None,
45
- output_dir: str | Path | None = None,
46
- disable_log: bool = False,
47
- browser: BrowserName = 'chrome',
48
- browser_profile: str = 'Default',
49
- debug: bool = False,
50
- comments: bool = False) -> None:
51
- self._no_log = disable_log
52
- self._session = requests.Session()
53
- self._browser = browser
54
- self._browser_profile = browser_profile
55
- self._setup_session(browser, browser_profile)
56
- self._output_dir = Path(output_dir or Path('.').resolve() / username)
57
- makedirs(self._output_dir, exist_ok=True)
58
- self._log_db = Path(log_file or self._output_dir / '.log.db')
59
- self._connection = sqlite3.connect(self._log_db)
60
- self._cursor = self._connection.cursor()
61
- self._setup_db()
62
- self._username = username
63
- self._video_urls: list[str] = []
64
- self._debug = debug
65
- self._get_comments = comments
66
-
67
- def _add_video_url(self, url: str) -> None:
68
- logger.debug(f'Added video URL: {url}')
69
- self._video_urls.append(url)
70
-
71
- def _setup_db(self) -> None:
72
- existed = self._log_db.exists()
73
- if not existed or (existed and self._log_db.stat().st_size == 0):
74
- logger.debug('Creating schema')
75
- self._cursor.execute(LOG_SCHEMA)
76
-
77
- def _setup_session(self,
78
- browser: BrowserName = 'chrome',
79
- browser_profile: str = 'Default') -> None:
80
- self._session.mount(
81
- 'https://',
82
- HTTPAdapter(max_retries=Retry(
83
- backoff_factor=1.5, # wait times are normally 1 and 3 seconds
84
- redirect=0,
85
- status=0,
86
- respect_retry_after_header=False,
87
- status_forcelist=frozenset((413, 429, 500, 502, 503, 504)),
88
- total=RETRY_ABORT_NUM)))
89
- self._session.headers.update({
90
- **SHARED_HEADERS,
91
- **dict(cookie='; '.join(f'{cookie.name}={cookie.value}' \
92
- for cookie in extract_cookies_from_browser(browser, browser_profile)
93
- if 'instagram.com' in cookie.domain))
94
- })
95
- r = self._get_rate_limited('https://www.instagram.com', return_json=False)
96
- m = re.search(r'"config":{"csrf_token":"([^"]+)"', r.text)
97
- assert m is not None
98
- self._session.headers.update({'x-csrftoken': m.group(1)})
99
-
100
- def _save_to_log(self, url: str) -> None:
101
- if self._no_log:
102
- return
103
- self._cursor.execute('INSERT INTO log (url) VALUES (?)', (_clean_url(url),))
104
- self._connection.commit()
105
-
106
- def _is_saved(self, url: str) -> bool:
107
- if self._no_log:
108
- return False
109
- self._cursor.execute('SELECT COUNT(url) FROM log WHERE url = ?', (_clean_url(url),))
110
- count: int
111
- count, = self._cursor.fetchone()
112
- return count == 1
113
-
114
- def _save_image_versions2(self, sub_item: CarouselMedia | MediaInfoItem,
115
- timestamp: int) -> None:
48
+ """Generic client for Instagram."""
49
+ def __init__(self, browser: BrowserName = 'chrome', browser_profile: str = 'Default') -> None:
50
+ """
51
+ Initialise the client.
52
+
53
+ Parameters
54
+ ----------
55
+ browser : str
56
+ The browser to use.
57
+
58
+ browser_profile : str
59
+ The browser profile to use.
60
+ """
61
+ self.session = setup_session(browser,
62
+ browser_profile,
63
+ SHARED_HEADERS,
64
+ domains={'instagram.com'},
65
+ status_forcelist=(413, 429, 500, 502, 503, 504))
66
+ self.failed_urls: set[str] = set()
67
+ """Set of failed URLs."""
68
+ self.video_urls: list[str] = []
69
+ """List of video URLs to download."""
70
+
71
+ def add_video_url(self, url: str) -> None:
72
+ """Add a video URL to the list of video URLs."""
73
+ log.info('Added video URL: %s', url)
74
+ self.video_urls.append(url)
75
+
76
+ def add_csrf_token_header(self) -> None:
77
+ """
78
+ Add CSRF token header to the session.
79
+
80
+ Raises
81
+ ------
82
+ CSRFTokenNotFound
83
+ If the CSRF token is not found in the cookies.
84
+ """
85
+ token = self.session.cookies.get('csrftoken')
86
+ if not token:
87
+ raise CSRFTokenNotFound
88
+ self.session.headers.update({'x-csrftoken': token})
89
+
90
+ def graphql_query(self,
91
+ variables: Mapping[str, Any],
92
+ *,
93
+ cast_to: type[T],
94
+ doc_id: str = '9806959572732215') -> T | None:
95
+ """Make a GraphQL query."""
96
+ with self.session.post('https://www.instagram.com/graphql/query',
97
+ headers={
98
+ 'content-type': 'application/x-www-form-urlencoded',
99
+ } | API_HEADERS,
100
+ data={
101
+ 'doc_id': doc_id,
102
+ 'variables': json.dumps(variables, separators=(',', ':'))
103
+ }) as r:
104
+ if r.status_code != HTTPStatus.OK:
105
+ return None
106
+ data = r.json()
107
+ assert isinstance(data, dict)
108
+ if (status := data.get('status')) != 'ok':
109
+ log.error('GraphQL status not "ok": %s', status)
110
+ return None
111
+ if data.get('errors'):
112
+ log.warning('Response has errors.')
113
+ log.debug('Response: %s', json.dumps(data, indent=2))
114
+ if not data.get('data'):
115
+ log.error('No data in response.')
116
+ return cast('T', data['data'])
117
+
118
+ def get_text(self, url: str, *, params: Mapping[str, str] | None = None) -> str:
119
+ """Get text from a URL."""
120
+ with self.session.get(url, params=params, headers=API_HEADERS) as r:
121
+ r.raise_for_status()
122
+ return r.text
123
+
124
+ def highlights_tray(self, user_id: int | str) -> HighlightsTray:
125
+ """Get the highlights tray data for a user."""
126
+ return self.get_json(
127
+ f'https://i.instagram.com/api/v1/highlights/{user_id}/highlights_tray/',
128
+ cast_to=HighlightsTray)
129
+
130
+ def __enter__(self) -> Self: # pragma: no cover
131
+ """Recommended way to initialise the client."""
132
+ return self
133
+
134
+ def __exit__(self, _: type[BaseException] | None, __: BaseException | None,
135
+ ___: TracebackType | None) -> None:
136
+ """Clean up."""
137
+
138
+ def is_saved(self, url: str) -> bool: # pragma: no cover
139
+ """Check if a URL is already saved."""
140
+ return False
141
+
142
+ def save_to_log(self, url: str) -> None:
143
+ """Save a URL to the log."""
144
+
145
+ def save_image_versions2(self, sub_item: CarouselMedia | MediaInfoItem, timestamp: int) -> None:
146
+ """Save images in the image_versions2 dictionary."""
116
147
  def key(x: MediaInfoItemImageVersions2Candidate) -> int:
117
148
  return x['width'] * x['height']
118
149
 
119
- best = sorted(sub_item['image_versions2']['candidates'], key=key, reverse=True)[0]
120
- if self._is_saved(best['url']):
150
+ best = max(sub_item['image_versions2']['candidates'], key=key)
151
+ if self.is_saved(best['url']):
152
+ return
153
+ r = self.session.head(best['url'])
154
+ if r.status_code != HTTPStatus.OK:
155
+ log.warning('HEAD request failed with status code %s.', r.status_code)
121
156
  return
122
- r = self._session.head(best['url'])
123
- r.raise_for_status()
124
157
  ext = get_extension(r.headers['content-type'])
125
158
  name = f'{sub_item["id"]}.{ext}'
126
- with open(name, 'wb') as f:
127
- for content in (self._session.get(best['url'],
128
- stream=True).iter_content(chunk_size=512)):
129
- f.write(content)
159
+ with Path(name).open('wb') as f:
160
+ f.writelines(self.session.get(best['url'], stream=True).iter_content(chunk_size=512))
130
161
  utime(name, (timestamp, timestamp))
131
- self._save_to_log(r.url)
132
-
133
- def _save_comments(self, edge: Edge) -> None:
134
- if self._get_comments:
135
- comment_url = ('https://www.instagram.com/api/v1/media/'
136
- f'{edge["node"]["id"]}/comments/')
137
- shared_params = dict(can_support_threading='true')
138
- top_comment_data = comment_data = self._get_rate_limited(
139
- comment_url,
140
- params={
141
- **shared_params, 'permalink_enabled': 'false'
142
- },
143
- cast_to=Comments)
144
- while comment_data['can_view_more_preview_comments'] and comment_data['next_min_id']:
145
- comment_data = self._get_rate_limited(comment_url,
146
- params={
147
- **shared_params,
148
- 'min_id':
149
- comment_data['next_min_id'],
150
- },
151
- cast_to=Comments)
152
- top_comment_data['comments'].extend(comment_data['comments'])
153
- comments_json = f'{edge["node"]["id"]}-comments.json'
154
- with open(comments_json, 'w+') as f:
155
- json.dump(top_comment_data, f, sort_keys=True, indent=2)
156
-
157
- def _save_media(self, edge: Edge) -> None:
158
- media_info_url = ('https://i.instagram.com/api/v1/media/'
159
- f'{edge["node"]["id"]}/info/')
160
- if self._is_saved(media_info_url):
162
+ self.save_to_log(r.url)
163
+
164
+ def save_comments(self, edge: Edge) -> None:
165
+ """Save comments for an edge node."""
166
+ comment_url = ('https://www.instagram.com/api/v1/media/'
167
+ f'{edge["node"]["id"]}/comments/')
168
+ shared_params = {'can_support_threading': 'true'}
169
+ try:
170
+ comment_data = self.get_json(comment_url,
171
+ params={
172
+ **shared_params, 'permalink_enabled': 'false'
173
+ },
174
+ cast_to=Comments)
175
+ except HTTPError:
176
+ log.exception('Failed to get comments.')
177
+ return
178
+ top_comment_data: Any = comment_data
179
+ while comment_data['can_view_more_preview_comments'] and comment_data['next_min_id']:
180
+ try:
181
+ comment_data = self.get_json(comment_url,
182
+ params={
183
+ **shared_params,
184
+ 'min_id':
185
+ comment_data['next_min_id'],
186
+ },
187
+ cast_to=Comments)
188
+ except HTTPError:
189
+ log.exception('Failed to get comments.')
190
+ break
191
+ top_comment_data['comments'] = (list(top_comment_data['comments']) +
192
+ list(comment_data['comments']))
193
+ comments_json = f'{edge["node"]["id"]}-comments.json'
194
+ with Path(comments_json).open('w+', encoding='utf-8') as f:
195
+ json.dump(top_comment_data, f, sort_keys=True, indent=2)
196
+
197
+ def save_media(self, edge: Edge) -> None:
198
+ """
199
+ Save media for an edge node.
200
+
201
+ Raises
202
+ ------
203
+ UnexpectedRedirect
204
+ If a redirect occurs unexpectedly.
205
+ """
206
+ media_info_url = f'https://www.instagram.com/api/v1/media/{edge["node"]["pk"]}/info/'
207
+ log.info('Saving media at URL: %s', media_info_url)
208
+ if self.is_saved(media_info_url):
209
+ return
210
+ r = self.session.get(media_info_url, headers=API_HEADERS, allow_redirects=False)
211
+ if r.status_code != HTTPStatus.OK:
212
+ if r.status_code in {HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND}:
213
+ raise UnexpectedRedirect
214
+ log.warning('GET request failed with status code %s.', r.status_code)
215
+ log.debug('Content: %s', r.text)
161
216
  return
162
- media_info = self._get_rate_limited(media_info_url, cast_to=MediaInfo)
163
- if media_info['more_available'] or media_info['num_results'] != 1:
164
- pp(media_info)
165
- raise ValueError('Unhandled more_available set to True')
217
+ if 'image_versions2' not in r.text or 'taken_at' not in r.text:
218
+ log.warning('Invalid response. image_versions2 dict not found.')
219
+ return
220
+ media_info: MediaInfo = r.json()
166
221
  timestamp = media_info['items'][0]['taken_at']
167
222
  id_json_file = f'{edge["node"]["id"]}.json'
168
223
  media_info_json_file = f'{edge["node"]["id"]}-media-info-0000.json'
@@ -170,141 +225,48 @@ class InstagramClient:
170
225
  write_if_new(media_info_json_file, str(json_dumps_formatted(media_info)))
171
226
  for file in (id_json_file, media_info_json_file):
172
227
  utime(file, (timestamp, timestamp))
173
- self._save_to_log(media_info_url)
228
+ self.save_to_log(media_info_url)
174
229
  for item in media_info['items']:
175
230
  timestamp = item['taken_at']
176
- if 'carousel_media' in item:
177
- for sub_item in item['carousel_media']:
178
- self._save_image_versions2(sub_item, timestamp)
231
+ if (carousel_media := item.get('carousel_media')):
232
+ for sub_item in carousel_media:
233
+ self.save_image_versions2(sub_item, timestamp)
179
234
  elif 'image_versions2' in item:
180
- self._save_image_versions2(item, timestamp)
235
+ self.save_image_versions2(item, timestamp)
181
236
 
182
- def _save_stuff(self, edges: Collection[Edge], parent_edge: Edge | None = None) -> None:
237
+ def save_edges(self, edges: Iterable[Edge], parent_edge: Edge | None = None) -> None:
238
+ """Save edge node media."""
183
239
  for edge in edges:
184
- if edge['node']['__typename'] == 'GraphVideo':
240
+ if edge['node']['__typename'] == 'XDTMediaDict':
185
241
  try:
186
- shortcode = edge['node']['shortcode']
187
- except KeyError as e:
242
+ shortcode = edge['node']['code']
243
+ except KeyError:
188
244
  if parent_edge:
189
245
  try:
190
- shortcode = parent_edge['node']['shortcode']
191
- except KeyError as exc:
192
- raise ValueError('Unknown shortcode') from exc
246
+ shortcode = parent_edge['node']['code']
247
+ except KeyError:
248
+ log.exception('Unknown shortcode.')
249
+ return
193
250
  else:
194
- raise ValueError('Unknown shortcode') from e
195
- self._add_video_url(f'https://www.instagram.com/p/{shortcode}')
196
- elif edge['node']['__typename'] == 'GraphImage':
197
- self._save_media(edge)
198
- elif edge['node']['__typename'] == 'GraphSidecar':
199
- logger.debug('Recursion into child edges')
200
- if (not edge['node']['comments_disabled']
201
- and edge['node']['edge_media_to_comment']['count']):
202
- self._save_comments(edge)
203
- self._save_stuff(edge['node']['edge_sidecar_to_children']['edges'], edge)
251
+ log.exception('Unknown shortcode.')
252
+ if edge['node'].get('video_dash_manifest'):
253
+ self.add_video_url(f'https://www.instagram.com/p/{shortcode}/')
254
+ else:
255
+ try:
256
+ self.save_comments(edge)
257
+ self.save_media(edge)
258
+ except requests.exceptions.RetryError:
259
+ log.exception('Retries exhausted.')
260
+ return
204
261
  else:
205
- raise ValueError(f'Unknown type "{edge["node"]["__typename"]}"')
206
-
207
- @overload
208
- def _get_rate_limited(self, url: str, *, cast_to: Type[T]) -> T:
209
- pass
210
-
211
- @overload
212
- def _get_rate_limited(self,
213
- url: str,
214
- *,
215
- return_json: Literal[False] = False) -> requests.Response:
216
- pass
217
-
218
- @overload
219
- def _get_rate_limited(self,
220
- url: str,
221
- *,
222
- params: Mapping[str, str] | None = None,
223
- cast_to: Type[T]) -> T:
224
- pass
225
-
226
- @sleep_and_retry
227
- @limits(calls=10, period=60)
228
- def _get_rate_limited(
229
- self,
230
- url: str,
231
- *,
232
- return_json: bool = True,
233
- params: Mapping[str, str] | None = None,
234
- cast_to: Type[T] | None = None) -> T | requests.Response: # pylint: disable=unused-argument
235
- with self._session.get(url, params=params) as r:
236
- r.raise_for_status()
237
- return r.json() if return_json else r
262
+ log.warning( # type: ignore[unreachable]
263
+ 'Unknown type: `%s`. Item %s will not be processed.',
264
+ edge['node']['__typename'], edge['node']['id'])
265
+ shortcode = edge['node']['code']
266
+ self.failed_urls.add(f'https://www.instagram.com/p/{shortcode}/')
238
267
 
239
- def _highlights_tray(self, user_id: int | str) -> HighlightsTray:
240
- return self._get_rate_limited(
241
- f'https://i.instagram.com/api/v1/highlights/{user_id}/'
242
- 'highlights_tray/',
243
- cast_to=HighlightsTray)
244
-
245
- def __enter__(self) -> 'InstagramClient':
246
- """Recommended way to initialise the client."""
247
- return self
248
-
249
- def __exit__(self, _: Type[BaseException], __: BaseException, ___: Traceback) -> None:
250
- """Clean up."""
251
- self._cursor.close()
252
- self._connection.close()
253
-
254
- def process(self) -> None:
255
- """Process posts."""
256
- with chdir(self._output_dir):
257
- self._get_rate_limited(f'https://www.instagram.com/{self._username}/',
258
- return_json=False)
259
- r = self._get_rate_limited('https://i.instagram.com/api/v1/users/web_profile_info/',
260
- params={'username': self._username},
261
- cast_to=WebProfileInfo)
262
- with open('web_profile_info.json', 'w') as f:
263
- json.dump(r, f, indent=2, sort_keys=True)
264
- user_info = r['data']['user']
265
- if not self._is_saved(user_info['profile_pic_url_hd']):
266
- with open('profile_pic.jpg', 'wb') as f:
267
- for chunk in self._session.get(user_info['profile_pic_url_hd'],
268
- stream=True).iter_content(chunk_size=512):
269
- f.write(chunk)
270
- self._save_to_log(user_info['profile_pic_url_hd'])
271
- for item in self._highlights_tray(user_info['id'])['tray']:
272
- self._add_video_url('https://www.instagram.com/stories/highlights/'
273
- f'{item["id"].split(":")[-1]}/')
274
- self._save_stuff(user_info['edge_owner_to_timeline_media']['edges'])
275
- page_info = user_info['edge_owner_to_timeline_media']['page_info']
276
- while page_info['has_next_page']:
277
- params = dict(query_hash='69cba40317214236af40e7efa697781d',
278
- variables=json.dumps(
279
- dict(id=user_info['id'], first=12,
280
- after=page_info['end_cursor'])))
281
- media = self._get_rate_limited(
282
- 'https://www.instagram.com/graphql/query/',
283
- params=params,
284
- cast_to=WebProfileInfo)['data']['user']['edge_owner_to_timeline_media']
285
- page_info = media['page_info']
286
- self._save_stuff(media['edges'])
287
- if len(self._video_urls) > 0:
288
- options = deepcopy(SHARED_YT_DLP_OPTIONS)
289
- options.update({
290
- 'cookiefile': None,
291
- 'cookiesfrombrowser': (self._browser, self._browser_profile),
292
- 'getcomments': self._get_comments,
293
- 'verbose': self._debug
294
- })
295
- with yt_dlp.YoutubeDL(options) as ydl:
296
- failed_urls: list[str] = []
297
- while (self._video_urls and (url := self._video_urls.pop())):
298
- if self._is_saved(url):
299
- logger.debug(f'{url} is already saved')
300
- continue
301
- if ydl.extract_info(url):
302
- logger.debug(f'Extracting {url}')
303
- self._save_to_log(url)
304
- else:
305
- failed_urls.append(url)
306
- if len(failed_urls) > 0:
307
- logger.error('Some video URIs failed. Check failed.txt.')
308
- with open('failed.txt', 'w') as f:
309
- for url in failed_urls:
310
- f.write(f'{url}\n')
268
+ def get_json(self, url: str, *, cast_to: type[T], params: Mapping[str, str] | None = None) -> T:
269
+ """Get JSON data from a URL."""
270
+ with self.session.get(url, params=params, headers=API_HEADERS) as r:
271
+ r.raise_for_status()
272
+ return cast('T', r.json())