instagram-archiver 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of instagram-archiver might be problematic.

Constants module (imported elsewhere as .constants):

@@ -1,107 +1,63 @@
-from typing import TYPE_CHECKING, Final, Mapping
+"""Constants."""
+from __future__ import annotations
 
-from .utils import YoutubeDLLogger
+__all__ = ('API_HEADERS', 'BROWSER_CHOICES', 'PAGE_FETCH_HEADERS', 'SHARED_HEADERS', 'USER_AGENT')
 
-__all__ = ('BROWSER_CHOICES', 'LOG_SCHEMA', 'RETRY_ABORT_NUM', 'SHARED_HEADERS',
-           'SHARED_YT_DLP_OPTIONS', 'YT_DLP_SLEEP_INTERVAL', 'USER_AGENT')
+USER_AGENT = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
+              'Chrome/137.0.0.0 Safari/537.36')
+"""
+User agent.
 
-if TYPE_CHECKING:
-    from yt_dlp import YDLOpts
-
-USER_AGENT: Final[str] = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
-                          'Chrome/112.0.0.0 Safari/537.36')
-# Do not set the x-ig-d header as this will cause API calls to return 404
-SHARED_HEADERS: Final[Mapping[str, str]] = {
-    'accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,image/jxl,'
-               'image/avif,image/webp,image/apng,*/*;q=0.8,'
-               'application/signed-exchange;v=b3;q=0.9'),
-    'accept-language': 'en,en-GB;q=0.9,en-US;q=0.8',
+:meta hide-value:
+"""
+SHARED_HEADERS = {
+    'accept': '*/*',
     'authority': 'www.instagram.com',
     'cache-control': 'no-cache',
     'dnt': '1',
     'pragma': 'no-cache',
-    'referer': 'https://www.instagram.com',
-    'upgrade-insecure-requests': '1',
     'user-agent': USER_AGENT,
-    'viewport-width': '2560',
-    'x-ig-app-id': '936619743392459'
+    # 'x-asbd-id': '359341',
+    # 'x-ig-app-id': '936619743392459',
+}
+"""
+Headers to use for requests.
+
+:meta hide-value:
+"""
+API_HEADERS = {
+    'x-asbd-id': '359341',
+    'x-ig-app-id': '936619743392459',
+}
+"""
+Headers to use for API requests.
+
+:meta hide-value:
+"""
+PAGE_FETCH_HEADERS = {
+    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,'
+              'image/apng,*/*;q=0.8',
+    'dpr': '1.5',
+    'sec-fetch-mode': 'navigate',  # Definitely required.
+    'viewport-width': '3840',
 }
-LOG_SCHEMA: Final[str] = '''
-CREATE TABLE log (
+"""
+Headers to use for fetching HTML pages.
+
+:meta hide-value:
+"""
+LOG_SCHEMA = """CREATE TABLE log (
     url TEXT PRIMARY KEY NOT NULL,
     date TEXT DEFAULT CURRENT_TIMESTAMP NOT NULL
-);
-'''
-#: Calls per minute allowed.
-CALLS_PER_MINUTE: Final[int] = 10
-#: yt-dlp sleep interval.
-YT_DLP_SLEEP_INTERVAL: Final[int] = 60 // CALLS_PER_MINUTE
-#: Value taken from Instagram's JS under BootloaderConfig
-RETRY_ABORT_NUM: Final[int] = 2
-SHARED_YT_DLP_OPTIONS: 'YDLOpts' = {
-    'allowed_extractors': ['Instagram.*'],
-    'allsubtitles': True,
-    'cookiesfrombrowser': None,
-    'geo_bypass': True,
-    'getcomments': False,
-    'hls_use_mpegts': True,
-    'http_headers': SHARED_HEADERS,
-    'ignore_no_formats_error': True,
-    'ignoreerrors': True,
-    'logger': YoutubeDLLogger(),
-    'outtmpl': {
-        'default': '%(title).128s___src=%(extractor)s___id=%(id)s.%(ext)s',
-        'pl_thumbnail': ''
-    },
-    'overwrites': False,
-    'max_sleep_interval': 6,
-    'merge_output_format': 'mkv',
-    'postprocessors': [{
-        'api': 'https://sponsor.ajay.app',
-        'categories': [
-            'preview', 'selfpromo', 'interaction', 'music_offtopic', 'sponsor', 'poi_highlight',
-            'intro', 'outro', 'filler', 'chapter'
-        ],
-        'key': 'SponsorBlock',
-        'when': 'after_filter'
-    }, {
-        'format': 'srt',
-        'key': 'FFmpegSubtitlesConvertor',
-        'when': 'before_dl'
-    }, {
-        'already_have_subtitle': True,
-        'key': 'FFmpegEmbedSubtitle'
-    }, {
-        'force_keyframes': False,
-        'key': 'ModifyChapters',
-        'remove_chapters_patterns': [],
-        'remove_ranges': [],
-        'remove_sponsor_segments': [],
-        'sponsorblock_chapter_title': '[SponsorBlock]: %(category_names)'
-    }, {
-        'add_chapters': True,
-        'add_infojson': 'if_exists',
-        'add_metadata': True,
-        'key': 'FFmpegMetadata'
-    }, {
-        'already_have_thumbnail': False,
-        'key': 'EmbedThumbnail'
-    }, {
-        'key': 'FFmpegConcat',
-        'only_multi_video': True,
-        'when': 'playlist'
-    }],
-    'restrictfilenames': True,
-    'skip_unavailable_fragments': True,
-    'sleep_interval': YT_DLP_SLEEP_INTERVAL,
-    'sleep_interval_requests': YT_DLP_SLEEP_INTERVAL,
-    'sleep_interval_subtitles': YT_DLP_SLEEP_INTERVAL,
-    'subtitleslangs': ['all'],
-    'verbose': False,
-    'writeautomaticsub': True,
-    'writesubtitles': True,
-    'writeinfojson': True,
-    'writethumbnail': True,
-}
-#: Possible browser choices to get cookies from.
+);"""
+"""
+Schema for log database.
+
+:meta hide-value:
+"""
 BROWSER_CHOICES = ('brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'firefox', 'safari')
+"""
+Possible browser choices to get cookies from.
+
+:meta hide-value:
+"""

CLI entry-point module (defines the main and save_saved_main commands):

@@ -1,82 +1,105 @@
+"""Main application."""
+from __future__ import annotations
+
 from pathlib import Path
-import sys
+from typing import TYPE_CHECKING
 
-from loguru import logger
-from requests.exceptions import RetryError
 import click
 
-from .client import AuthenticationError, InstagramClient
+from .client import UnexpectedRedirect
 from .constants import BROWSER_CHOICES
-from .find_query_hashes import find_query_hashes
-from .ig_typing import BrowserName
+from .profile_scraper import ProfileScraper
+from .saved_scraper import SavedScraper
 from .utils import setup_logging
 
+if TYPE_CHECKING:
+    from .typing import BrowserName
+
 __all__ = ('main',)
 
 
-@click.command()
+@click.command(context_settings={'help_option_names': ('-h', '--help')})
 @click.option('-o',
               '--output-dir',
-              default=None,
-              help='Output directory',
-              type=click.Path(file_okay=False, path_type=Path, resolve_path=True, writable=True))
+              default='%(username)s',
+              help='Output directory.',
+              type=click.Path(file_okay=False, writable=True))
 @click.option('-b',
               '--browser',
               default='chrome',
               type=click.Choice(BROWSER_CHOICES),
-              help='Browser to read cookies from')
-@click.option('-p', '--profile', default='Default', help='Browser profile')
-@click.option('-d', '--debug', is_flag=True, help='Enable debug output')
-@click.option('--no-log', is_flag=True, help='Ignore log (re-fetch everything)')
+              help='Browser to read cookies from.')
+@click.option('-p', '--profile', default='Default', help='Browser profile.')
+@click.option('-d', '--debug', is_flag=True, help='Enable debug output.')
+@click.option('--no-log', is_flag=True, help='Ignore log (re-fetch everything).')
 @click.option('-C',
               '--include-comments',
               is_flag=True,
               help='Also download all comments (extends download time significantly).')
-@click.option('--print-query-hashes',
-              is_flag=True,
-              help='Print current query hashes and exit.',
-              hidden=True)
-@click.argument('username', required=False)
-def main(output_dir: Path | None,
-         browser: BrowserName,
-         profile: str,
+@click.argument('username')
+def main(output_dir: str,
+         username: str,
+         browser: BrowserName = 'chrome',
+         profile: str = 'Default',
+         *,
         debug: bool = False,
         include_comments: bool = False,
-         no_log: bool = False,
-         print_query_hashes: bool = False,
-         username: str | None = None) -> None:
-    """Archive a profile's posts."""
-    setup_logging(debug)
-    if print_query_hashes:
-        for query_hash in sorted(find_query_hashes(browser, profile)):
-            click.echo(query_hash)
-        return
-    if not username:
-        raise click.UsageError('Username is required')
+         no_log: bool = False) -> None:
+    """Archive a profile's posts."""  # noqa: DOC501
+    setup_logging(debug=debug)
     try:
-        with InstagramClient(browser=browser,
-                             browser_profile=profile,
-                             comments=include_comments,
-                             debug=debug,
-                             disable_log=no_log,
-                             output_dir=output_dir,
-                             username=username) as client:
+        with ProfileScraper(browser=browser,
+                            browser_profile=profile,
+                            comments=include_comments,
+                            disable_log=no_log,
+                            output_dir=(Path(output_dir % {'username': username})
+                                        if '%(username)s' in output_dir else Path(output_dir)),
+                            username=username) as client:
             client.process()
-    except RetryError as e:
-        click.echo(
-            'Open your browser and login if necessary. If you are logged in and this continues, '
-            'try waiting at least 12 hours.',
-            file=sys.stderr)
-        raise click.Abort() from e
-    except AuthenticationError as e:
-        click.echo(
-            'You are probably not logged into Instagram in this browser profile or your '
-            'session has expired.',
-            file=sys.stderr)
-        raise click.Abort() from e
+    except UnexpectedRedirect as e:
+        click.echo('Unexpected redirect. Assuming request limit has been reached.', err=True)
+        raise click.Abort from e
+    except Exception as e:
+        if isinstance(e, KeyboardInterrupt) or debug:
+            raise
+        click.echo('Run with --debug for more information.', err=True)
+        raise click.Abort from e
+
+
+@click.command(context_settings={'help_option_names': ('-h', '--help')})
+@click.option('-o',
+              '--output-dir',
+              default='.',
+              help='Output directory.',
+              type=click.Path(file_okay=False, writable=True))
+@click.option('-b',
+              '--browser',
+              default='chrome',
+              type=click.Choice(BROWSER_CHOICES),
+              help='Browser to read cookies from.')
+@click.option('-p', '--profile', default='Default', help='Browser profile.')
+@click.option('-d', '--debug', is_flag=True, help='Enable debug output.')
+@click.option('-C',
+              '--include-comments',
+              is_flag=True,
+              help='Also download all comments (extends download time significantly).')
+@click.option('-u', '--unsave', is_flag=True, help='Unsave posts after successful archive.')
+def save_saved_main(output_dir: str,
+                    browser: BrowserName = 'chrome',
+                    profile: str = 'Default',
+                    *,
+                    debug: bool = False,
+                    include_comments: bool = False,
+                    unsave: bool = False) -> None:
+    """Archive your saved posts."""  # noqa: DOC501
+    setup_logging(debug=debug)
+    try:
+        SavedScraper(browser, profile, output_dir, comments=include_comments).process(unsave=unsave)
+    except UnexpectedRedirect as e:
+        click.echo('Unexpected redirect. Assuming request limit has been reached.', err=True)
+        raise click.Abort from e
     except Exception as e:
-        if debug:
-            logger.exception(e)
-        else:
-            click.echo('Run with --debug for more information')
-        raise click.Abort(f'{e} (run with --debug for more information)') from e
+        if isinstance(e, KeyboardInterrupt) or debug:
+            raise
+        click.echo('Run with --debug for more information.', err=True)
+        raise click.Abort from e
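
Version 0.3.1 adds a second click command, save_saved_main, for archiving saved posts, and the -o default for main is now the printf-style template %(username)s, expanded before constructing ProfileScraper. A worked example of the expansion expression used in main (the username value is hypothetical):

    output_dir = '%(username)s'  # New click default.
    username = 'nasa'            # Hypothetical value for illustration.
    path = (output_dir % {'username': username}
            if '%(username)s' in output_dir else output_dir)
    assert path == 'nasa'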

New file: profile scraper module (imported as .profile_scraper):

@@ -0,0 +1,194 @@
+"""Instagram client."""
+from __future__ import annotations
+
+from contextlib import chdir
+from pathlib import Path
+from typing import TYPE_CHECKING, TypeVar, override
+from urllib.parse import urlparse
+import json
+import logging
+import sqlite3
+
+from requests import HTTPError
+from yt_dlp_utils import get_configured_yt_dlp
+
+from .client import InstagramClient
+from .constants import LOG_SCHEMA
+from .typing import (
+    BrowserName,
+    WebProfileInfo,
+    XDTAPIV1FeedUserTimelineGraphQLConnectionContainer,
+)
+from .utils import SaveCommentsCheckDisabledMixin
+
+if TYPE_CHECKING:
+    from types import TracebackType
+
+__all__ = ('ProfileScraper',)
+
+T = TypeVar('T')
+log = logging.getLogger(__name__)
+
+
+def _clean_url(url: str) -> str:
+    parsed = urlparse(url)
+    return f'https://{parsed.netloc}{parsed.path}'
+
+
+class ProfileScraper(SaveCommentsCheckDisabledMixin, InstagramClient):
+    """The scraper."""
+    def __init__(self,
+                 username: str,
+                 *,
+                 log_file: str | Path | None = None,
+                 output_dir: str | Path | None = None,
+                 disable_log: bool = False,
+                 browser: BrowserName = 'chrome',
+                 browser_profile: str = 'Default',
+                 comments: bool = False) -> None:
+        """
+        Initialise ``ProfileScraper``.
+
+        Parameters
+        ----------
+        username : str
+            The username to scrape.
+        log_file : str | Path | None
+            The log file to use.
+        output_dir : str | Path | None
+            The output directory to save the posts to.
+        disable_log : bool
+            Whether to disable logging or not.
+        browser : BrowserName
+            The browser to use.
+        browser_profile : str
+            The browser profile to use.
+        comments : bool
+            Whether to save comments or not.
+        """
+        super().__init__(browser, browser_profile)
+        self._no_log = disable_log
+        self._output_dir = Path(output_dir or Path.cwd() / username)
+        self._output_dir.mkdir(parents=True, exist_ok=True)
+        self._log_db = Path(log_file or self._output_dir / '.log.db')
+        self._connection = sqlite3.connect(self._log_db)
+        self._cursor = self._connection.cursor()
+        self._setup_db()
+        self._username = username
+        self.should_save_comments = comments
+
+    def _setup_db(self) -> None:
+        if self._no_log:
+            return
+        existed = self._log_db.exists()
+        if not existed or (existed and self._log_db.stat().st_size == 0):
+            log.debug('Creating schema.')
+            self._cursor.execute(LOG_SCHEMA)
+
+    @override
+    def save_to_log(self, url: str) -> None:
+        if self._no_log:
+            return
+        self._cursor.execute('INSERT INTO log (url) VALUES (?)', (_clean_url(url),))
+        self._connection.commit()
+
+    @override
+    def is_saved(self, url: str) -> bool:
+        if self._no_log:
+            return False
+        self._cursor.execute('SELECT COUNT(url) FROM log WHERE url = ?', (_clean_url(url),))
+        count: int
+        count, = self._cursor.fetchone()
+        return count == 1
+
+    @override
+    def __exit__(self, _: type[BaseException] | None, __: BaseException | None,
+                 ___: TracebackType | None) -> None:
+        """Clean up."""
+        self._cursor.close()
+        self._connection.close()
+
+    def process(self) -> None:
+        """Process posts."""
+        with chdir(self._output_dir):
+            self.get_text(f'https://www.instagram.com/{self._username}/')
+            self.add_csrf_token_header()
+            r = self.get_json('https://i.instagram.com/api/v1/users/web_profile_info/',
+                              params={'username': self._username},
+                              cast_to=WebProfileInfo)
+            with Path('web_profile_info.json').open('w', encoding='utf-8') as f:
+                json.dump(r, f, indent=2, sort_keys=True)
+            user_info = r['data']['user']
+            if not self.is_saved(user_info['profile_pic_url_hd']):
+                with Path('profile_pic.jpg').open('wb') as f:
+                    f.writelines(
+                        self.session.get(user_info['profile_pic_url_hd'],
+                                         stream=True).iter_content(chunk_size=512))
+                self.save_to_log(user_info['profile_pic_url_hd'])
+            try:
+                for item in self.highlights_tray(user_info['id'])['tray']:
+                    self.add_video_url('https://www.instagram.com/stories/highlights/'
+                                       f'{item["id"].split(":")[-1]}/')
+            except HTTPError:
+                log.exception('Failed to get highlights data.')
+            self.save_edges(user_info['edge_owner_to_timeline_media']['edges'])
+            d = self.graphql_query(
+                {
+                    'data': {
+                        'count': 12,
+                        'include_reel_media_seen_timestamp': True,
+                        'include_relationship_info': True,
+                        'latest_besties_reel_media': True,
+                        'latest_reel_media': True
+                    },
+                    'username': self._username,
+                    '__relay_internal__pv__PolarisIsLoggedInrelayprovider': True,
+                    '__relay_internal__pv__PolarisShareSheetV3relayprovider': True,
+                },
+                cast_to=XDTAPIV1FeedUserTimelineGraphQLConnectionContainer)
+            if not d:
+                log.error('First GraphQL query failed.')
+            else:
+                self.save_edges(d['xdt_api__v1__feed__user_timeline_graphql_connection']['edges'])
+                page_info = d['xdt_api__v1__feed__user_timeline_graphql_connection']['page_info']
+                while page_info['has_next_page']:
+                    d = self.graphql_query(
+                        {
+                            'after': page_info['end_cursor'],
+                            'before': None,
+                            'data': {
+                                'count': 12,
+                                'include_reel_media_seen_timestamp': True,
+                                'include_relationship_info': True,
+                                'latest_besties_reel_media': True,
+                                'latest_reel_media': True,
+                            },
+                            'first': 12,
+                            'last': None,
+                            'username': self._username,
+                            '__relay_internal__pv__PolarisIsLoggedInrelayprovider': True,
+                            '__relay_internal__pv__PolarisShareSheetV3relayprovider': True,
+                        },
+                        cast_to=XDTAPIV1FeedUserTimelineGraphQLConnectionContainer)
+                    if not d:
+                        break
+                    page_info = d['xdt_api__v1__feed__user_timeline_graphql_connection'][
+                        'page_info']
+                    self.save_edges(
+                        d['xdt_api__v1__feed__user_timeline_graphql_connection']['edges'])
+            if self.video_urls:
+                with get_configured_yt_dlp() as ydl:
+                    while self.video_urls and (url := self.video_urls.pop()):
+                        if self.is_saved(url):
+                            log.info('`%s` is already saved.', url)
+                            continue
+                        if ydl.extract_info(url):
+                            log.info('Extracting `%s`.', url)
+                            self.save_to_log(url)
+                        else:
+                            self.failed_urls.add(url)
+            if self.failed_urls:
+                log.warning('Some video URIs failed. Check failed.txt.')
+                with Path('failed.txt').open('w', encoding='utf-8') as f:
+                    for url in self.failed_urls:
+                        f.write(f'{url}\n')
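
ProfileScraper replaces 0.2.1's monolithic InstagramClient entry point and is used as a context manager so the SQLite log connection is closed on exit. A usage sketch mirroring the call in main (the absolute module path is an assumption; the diff only shows relative imports, and the username is hypothetical):

    from instagram_archiver.profile_scraper import ProfileScraper

    # Archive one profile's posts, highlights and profile picture.
    with ProfileScraper(username='nasa',  # Hypothetical username.
                        output_dir='nasa',
                        browser='chrome',
                        browser_profile='Default',
                        comments=False,
                        disable_log=False) as client:
        client.process()

Note that URLs are normalised by _clean_url (scheme, host and path only) before being checked against or written to the log table, so signed CDN query strings do not defeat deduplication.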

New file: saved-posts scraper module (imported as .saved_scraper):

@@ -0,0 +1,79 @@
+"""Saved posts scraper."""
+from __future__ import annotations
+
+from contextlib import chdir
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+import logging
+
+from .client import InstagramClient
+from .constants import API_HEADERS, PAGE_FETCH_HEADERS
+from .utils import SaveCommentsCheckDisabledMixin
+
+if TYPE_CHECKING:
+
+    from collections.abc import Iterable
+
+    from .typing import BrowserName
+
+__all__ = ('SavedScraper',)
+log = logging.getLogger(__name__)
+
+
+class SavedScraper(SaveCommentsCheckDisabledMixin, InstagramClient):
+    """Scrape saved posts."""
+    def __init__(
+        self,
+        browser: BrowserName = 'chrome',
+        browser_profile: str = 'Default',
+        output_dir: str | Path | None = None,
+        *,
+        comments: bool = False,
+    ) -> None:
+        """
+        Initialise ``SavedScraper``.
+
+        Parameters
+        ----------
+        browser : BrowserName
+            The browser to use.
+        browser_profile : str
+            The browser profile to use.
+        output_dir : str | Path | None
+            The output directory to save the posts to.
+        comments : bool
+            Whether to save comments or not.
+        """
+        super().__init__(browser, browser_profile)
+        self._output_dir = Path(output_dir or Path.cwd() / '@@saved-posts@@')
+        Path(self._output_dir).mkdir(parents=True, exist_ok=True)
+        self.should_save_comments = comments
+
+    def unsave(self, items: Iterable[str]) -> None:
+        """Unsave saved posts."""
+        for item in items:
+            log.info('Unsaving %s.', item)
+            self.session.post(f'https://www.instagram.com/web/save/{item}/unsave/',
+                              headers=API_HEADERS)
+
+    def process(self, *, unsave: bool = False) -> None:
+        """Process the saved posts."""
+        with chdir(self._output_dir):
+            self.add_csrf_token_header()
+            self.session.get('https://www.instagram.com/', headers=PAGE_FETCH_HEADERS)
+            feed = self.get_json('https://www.instagram.com/api/v1/feed/saved/posts/',
+                                 cast_to=dict[str, Any])
+            self.save_edges({
+                'node': {
+                    '__typename': 'XDTMediaDict',
+                    'id': item['media']['id'],
+                    'code': item['media']['code'],
+                    'owner': item['media']['owner'],
+                    'pk': item['media']['pk'],
+                    'video_dash_manifest': item['media'].get('video_dash_manifest')
+                }
+            } for item in feed['items'])
+            if unsave:
+                self.unsave(item['media']['code'] for item in feed['items'])
+            if feed.get('more_available'):
+                log.warning('Unhandled pagination.')
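
SavedScraper follows the same pattern but is constructed directly (no with-block) in save_saved_main: it fetches the first page of the saved-posts feed, reshapes each item into the edge format save_edges expects, and only warns when more pages exist. A usage sketch (module path assumed, as above):

    from instagram_archiver.saved_scraper import SavedScraper

    scraper = SavedScraper('chrome', 'Default', output_dir='saved-posts',
                           comments=False)
    scraper.process(unsave=False)  # unsave=True unsaves each post after archiving.

Note the explicit log.warning('Unhandled pagination.'): when more_available is set, only the first batch of saved posts is archived in this release.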