PyPI - instagram-posts-scraper - Versions diffs - 0.0.2__tar.gz → 0.0.3__tar.gz - Mend

instagram-posts-scraper 0.0.2 (tar.gz) → 0.0.3 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

{instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: instagram-posts-scraper
-Version: 0.0.2
+Version: 0.0.3
 Summary: Implement Instagram Posts Scraper for post data retrieval
 Home-page: https://github.com/FaustRen/instagram-posts-scraper
 Author: FaustRen
@@ -19,6 +19,7 @@ Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: license
+Dynamic: license-file
 Dynamic: requires-python
 Dynamic: summary
@@ -49,6 +50,8 @@ beautifulsoup4==4.13.3
 cloudscraper==1.2.71
 pandas==2.2.3
 pytz==2025.1
+seleniumbase==4.38.0
+requests==2.32.3
 ```
 ## Installation

{instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/instagram_posts_scraper.py RENAMED Viewed

@@ -25,8 +25,8 @@ class InstaPeriodScraper(object):
         )
     def check_account_is_public(self):
-        init_response = self.pixwox_request.send_requests(url=self.scraper.init_url)
-        self.profile_soup = self.parser.get_soup(response=init_response)
+        self.init_response = self.pixwox_request.send_requests(url=self.scraper.init_url)
+        self.profile_soup = self.parser.get_soup(response=self.init_response)
         self.userid = self.parser.get_userid(profile_soup=self.profile_soup)
         self.account_status = get_account_status(userid=self.userid, profile_soup=self.profile_soup)
         return self.account_status == "public"
@@ -227,6 +227,12 @@ class InstaPeriodScraper(object):
         username = self.target_info["username"]
         self.scraper.set_username(username)
         days_limit = target_info["days_limit"]
+        # check if user-agent & cookies are valid first
+        print("# check if user-agent & cookies are valid first")
+        valid_headers_cookies = get_valid_headers_cookies(username=username)
+        self.pixwox_request.set_valid_headers_cookies(valid_headers_cookies=valid_headers_cookies)
         if not self.check_account_is_public():
             print("This is private account")
             if self.account_status == "private":
@@ -238,9 +244,21 @@ class InstaPeriodScraper(object):
                 return res
         if self.check_account_is_public():
-            init_api_data = self.get_init_api_data() # 帳號資訊 & 上方頁面內容
-            self.get_profile()
+            self.scraper_utils = get_scraper_utils(html=self.init_response.text) # new
             print(f"This is public account")
+            # get_scraper_utils
+            init_response = self.pixwox_request.send_requests(url=self.scraper.init_url)
+            # init_api_data = self.get_init_api_data() # 帳號資訊 & 上方頁面內容
+            userid = self.scraper_utils["userid"]
+            username = self.scraper_utils["username"]
+            next_maxid = self.scraper_utils["data_maxid"]
+            next_ = self.scraper_utils["clean_data_next"]
+            next_api = f"https://www.piokok.com/api/posts?username={username}&userid={userid}&next={next_}==&maxid={next_maxid}"
+            init_api_data = self.pixwox_request.send_requests(url=next_api) # actually, this is next..
+            init_api_data = init_api_data.json()
+            self.get_profile()
             # can scrape next round's posts
             if init_api_data["posts"]["has_next"] != False:
                 maxid = init_api_data["posts"]["maxid"]
@@ -256,8 +274,12 @@ class InstaPeriodScraper(object):
                     scraped_posts=period_posts,
                     init_api_data=init_api_data
                     )
+                init_posts = self.parser.extract_init_posts(init_response.text)
+                res["init_posts"] = init_posts
                 return res
             # # no more posts
             elif init_api_data["posts"]["has_next"] == False: # (表示該帳號貼文數<=12, 無法繼續往下找)
                 res = self.get_public_account_res(scraped_posts=init_api_data["posts"]["items"], init_api_data=init_api_data)
+                init_posts = self.parser.extract_init_posts(init_response.text)
+                res["init_posts"] = init_posts
                 return res

{instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/parse.py RENAMED Viewed

@@ -38,6 +38,47 @@ class Parser(object):
             if each_info.text != ' ':
                 user_info_list.append(str(each_info.text))
         return user_info_list
+    @staticmethod
+    def extract_init_posts(html):
+        """
+        Extracts post details (text, likes, comments, time) from HTML content.
+        Args:
+            html (str): The full HTML content of the page.
+        Returns:
+            list[dict]: A list of dictionaries containing post information.
+        """
+        soup = BeautifulSoup(html, "html.parser")
+        # Find all post items under .posts > .items > .item
+        items = soup.select("div.posts div.items div.item")
+        init_posts = []
+        for item in items:
+            post = {}
+            # 1. Post content text
+            sum_div = item.select_one("div.sum")
+            post["text"] = sum_div.text.strip() if sum_div else None
+            # 2. Like count
+            like_span = item.select_one("span.count_item_like span.num")
+            post["likes"] = like_span.text.strip() if like_span else None
+            # 3. Comment count
+            comment_span = item.select_one("span.count_item_comment span.num")
+            post["comments"] = comment_span.text.strip() if comment_span else None
+            # 4. Post date/time
+            time_span = item.select_one("div.time span.txt")
+            post["time"] = time_span.text.strip() if time_span else None
+            init_posts.append(post)
+        return init_posts
     def get_soup(self, response):
         soup = BeautifulSoup(response.text, self.__DEFAULT_SOUP_PARSER)

{instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/request.py RENAMED Viewed

@@ -2,6 +2,7 @@
 import json
 import cloudscraper
 from bs4 import BeautifulSoup
+import requests
 class PixwoxRequest(object):
@@ -14,8 +15,18 @@ class PixwoxRequest(object):
                      "mobile": "False"})
     def send_requests(self, url):
-        response = self.__scraper.get(url)
+        # response = self.__scraper.get(url) # temporary stop this good method  :（
+        response = requests.get(
+            url=url,
+            headers={"User-Agent":self.__user_agent},
+            cookies=self.__cookies
+        )
         return response
+    def set_valid_headers_cookies(self, valid_headers_cookies):
+        self.__valid_headers_cookies = valid_headers_cookies
+        self.__user_agent = self.__valid_headers_cookies[0].get("User-Agent")
+        self.__cookies = self.__valid_headers_cookies[1]
     def get_init_content(self, username: str) -> str:
         get_url = f"https://www.picnob.com/zh-hant/profile/{username}"

instagram_posts_scraper-0.0.3/instagram_posts_scraper/utils/utils.py ADDED Viewed

@@ -0,0 +1,145 @@
+# -*- coding: utf-8 -*-
+import concurrent.futures as futures
+from datetime import datetime
+import pytz
+import pandas as pd
+from functools import wraps
+import time
+import os
+import json
+import requests
+from seleniumbase import Driver
+from pathlib import Path
+from bs4 import BeautifulSoup
+def timeit(func):
+    @wraps(func)
+    def timeit_wrapper(*args, **kwargs):
+        start_time = time.perf_counter()
+        result = func(*args, **kwargs)
+        end_time = time.perf_counter()
+        total_time = end_time - start_time
+        print(f'Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds')
+        return result
+    return timeit_wrapper
+def timeout(timelimit):
+    def decorator(func):
+        def decorated(*args, **kwargs):
+            with futures.ThreadPoolExecutor(max_workers=1) as executor:
+                future = executor.submit(func, *args, **kwargs)
+                try:
+                    result = future.result(timelimit)
+                except futures.TimeoutError:
+                    print('Time out!')
+                    raise TimeoutError from None
+                else:
+                    print(result)
+                executor._threads.clear()
+                futures.thread._threads_queues.clear()
+                return result
+        return decorated
+    return decorator
+def get_current_time(timezone="Asia/Taipei"):
+    current_time_utc = datetime.utcnow()
+    target_timezone = pytz.timezone(timezone)
+    target_current_time = current_time_utc.replace(
+        tzinfo=pytz.utc).astimezone(target_timezone)
+    return target_current_time
+def get_account_status(userid, profile_soup=None):
+    if userid == "":
+        return "missing"
+    else:
+        private_span = profile_soup.find(
+            "span", class_="ident private icon icon_lock")
+        if private_span:
+            return "private"
+        return "public"
+def has_all_data_been_collected(scraped_items:pd.DataFrame,counts_of_posts):
+    """Whether program get all posts already."""
+    if len(set([each["shortcode"] for each in scraped_items])) >= int(counts_of_posts):
+        return True
+    return False
+def is_date_exceed_half_year(scraped_items:pd.DataFrame, days_limit:int):
+    """Check if scraped posts' published date exceed half year"""
+    current_time = datetime.now()
+    days_ago_list = [int(
+        (current_time - pd.to_datetime(each["time"], unit="s")).days) for each in scraped_items]
+    max_days_ago = max(days_ago_list) # 爬到的貼文裡, 發文時間距離當前時間最遠的日期
+    if max_days_ago > days_limit:  # 半年內
+        return True
+    return False
+def get_valid_headers_cookies(username: str):
+    # 1. Get the main script directory (not the utils directory)
+    main_dir = Path(__file__).resolve().parent.parent  # one level above utils
+    json_dir = main_dir / "auth_data" # directory to store headers/cookies
+    json_dir.mkdir(exist_ok=True)  # create directory if it doesn't exist
+    json_path = json_dir / f"instagram_posts_scraper_headers.json" # path to the JSON file
+    url = f"https://www.pixnoy.com/profile/{username}"
+    # 2. Use Selenium to bypass Cloudflare and save headers/cookies
+    def crawl_and_save():
+        print("⚠️ Launching Selenium to bypass Cloudflare...")
+        driver = Driver(uc=True, headless=True)
+        driver.uc_open_with_reconnect(url)
+        time.sleep(10)
+        cookies = {c['name']: c['value'] for c in driver.get_cookies()}
+        user_agent = driver.execute_script("return navigator.userAgent;")
+        headers = {"User-Agent": user_agent}
+        # save json
+        with open(json_path, "w") as f:
+            json.dump({"headers": headers, "cookies": cookies}, f, indent=2)
+        driver.quit()
+        print("Already update headers and cookies")
+        return headers, cookies
+    # 2. check json 是否存在
+    if json_path.exists():
+        with open(json_path, "r") as f:
+            try:
+                data = json.load(f)
+                headers = data["headers"]
+                cookies = data["cookies"]
+                print("Attempting to use cached headers and cookies...")
+                resp = requests.get(url, headers=headers, cookies=cookies)
+                if resp.status_code == 200:
+                    print("Cache is valid. Using cached data.")
+                    return headers, cookies
+                else:
+                    print(f"Cache is invalid. Status code: {resp.status_code}. Fetching new data...")
+                    return crawl_and_save()
+            except Exception as e:
+                print("Failed to read JSON file. Re-fetching headers and cookies.")
+                return crawl_and_save()
+    else:
+        return crawl_and_save()
+def get_scraper_utils(html:str):
+    soup = BeautifulSoup(html, 'html.parser')
+    userid = soup.find('input', {'name': 'userid'})['value']
+    username = soup.find('input', {'name': 'username'})['value']
+    more_btns = soup.select('a.more_btn') # find all a.more_btn
+    for btn in more_btns: # Filter data-next (exists value)
+        data_next = btn.get('data-next')
+        if data_next:
+            clean_data_next = data_next.rstrip('=')
+            data_maxid = btn.get('data-maxid')
+            break
+    return {
+        "userid":userid,
+        "username":username,
+        "clean_data_next":clean_data_next,
+        "data_maxid":data_maxid
+    }

{instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: instagram-posts-scraper
-Version: 0.0.2
+Version: 0.0.3
 Summary: Implement Instagram Posts Scraper for post data retrieval
 Home-page: https://github.com/FaustRen/instagram-posts-scraper
 Author: FaustRen
@@ -19,6 +19,7 @@ Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: license
+Dynamic: license-file
 Dynamic: requires-python
 Dynamic: summary
@@ -49,6 +50,8 @@ beautifulsoup4==4.13.3
 cloudscraper==1.2.71
 pandas==2.2.3
 pytz==2025.1
+seleniumbase==4.38.0
+requests==2.32.3
 ```
 ## Installation

{instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/setup.py RENAMED Viewed

@@ -4,7 +4,7 @@ from setuptools import setup, find_packages
 setup(
     name='instagram-posts-scraper',
-    version='0.0.2',
+    version='0.0.3',
     packages=[
         "instagram_posts_scraper",
         "instagram_posts_scraper.utils"

instagram_posts_scraper-0.0.2/instagram_posts_scraper/utils/utils.py DELETED Viewed

@@ -1,71 +0,0 @@
-# -*- coding: utf-8 -*-
-import concurrent.futures as futures
-from datetime import datetime
-import pytz
-import pandas as pd
-from functools import wraps
-import time
-def timeit(func):
-    @wraps(func)
-    def timeit_wrapper(*args, **kwargs):
-        start_time = time.perf_counter()
-        result = func(*args, **kwargs)
-        end_time = time.perf_counter()
-        total_time = end_time - start_time
-        print(f'Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds')
-        return result
-    return timeit_wrapper
-def timeout(timelimit):
-    def decorator(func):
-        def decorated(*args, **kwargs):
-            with futures.ThreadPoolExecutor(max_workers=1) as executor:
-                future = executor.submit(func, *args, **kwargs)
-                try:
-                    result = future.result(timelimit)
-                except futures.TimeoutError:
-                    print('Time out!')
-                    raise TimeoutError from None
-                else:
-                    print(result)
-                executor._threads.clear()
-                futures.thread._threads_queues.clear()
-                return result
-        return decorated
-    return decorator
-def get_current_time(timezone="Asia/Taipei"):
-    current_time_utc = datetime.utcnow()
-    target_timezone = pytz.timezone(timezone)
-    target_current_time = current_time_utc.replace(
-        tzinfo=pytz.utc).astimezone(target_timezone)
-    return target_current_time
-def get_account_status(userid, profile_soup=None):
-    if userid == "":
-        return "missing"
-    else:
-        private_span = profile_soup.find(
-            "span", class_="ident private icon icon_lock")
-        if private_span:
-            return "private"
-        return "public"
-def has_all_data_been_collected(scraped_items:pd.DataFrame,counts_of_posts):
-    """Whether program get all posts already."""
-    if len(set([each["shortcode"] for each in scraped_items])) >= int(counts_of_posts):
-        return True
-    return False
-def is_date_exceed_half_year(scraped_items:pd.DataFrame, days_limit:int):
-    """Check if scraped posts' published date exceed half year"""
-    current_time = datetime.now()
-    days_ago_list = [int(
-        (current_time - pd.to_datetime(each["time"], unit="s")).days) for each in scraped_items]
-    max_days_ago = max(days_ago_list) # 爬到的貼文裡, 發文時間距離當前時間最遠的日期
-    if max_days_ago > days_limit:  # 半年內
-        return True
-    return False