instagram-posts-scraper 0.0.2__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19)
  1. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/PKG-INFO +5 -2
  2. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/instagram_posts_scraper.py +26 -4
  3. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/parse.py +41 -0
  4. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/request.py +12 -1
  5. instagram_posts_scraper-0.0.3/instagram_posts_scraper/utils/utils.py +145 -0
  6. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper.egg-info/PKG-INFO +5 -2
  7. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/setup.py +1 -1
  8. instagram_posts_scraper-0.0.2/instagram_posts_scraper/utils/utils.py +0 -71
  9. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/LICENSE +0 -0
  10. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/__init__.py +0 -0
  11. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/example.py +0 -0
  12. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/file_operation.py +0 -0
  13. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/scraper.py +0 -0
  14. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/utils/__init__.py +0 -0
  15. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/utils.py +0 -0
  16. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper.egg-info/SOURCES.txt +0 -0
  17. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper.egg-info/dependency_links.txt +0 -0
  18. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper.egg-info/top_level.txt +0 -0
  19. {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: instagram-posts-scraper
3
- Version: 0.0.2
3
+ Version: 0.0.3
4
4
  Summary: Implement Instagram Posts Scraper for post data retrieval
5
5
  Home-page: https://github.com/FaustRen/instagram-posts-scraper
6
6
  Author: FaustRen
@@ -19,6 +19,7 @@ Dynamic: description
19
19
  Dynamic: description-content-type
20
20
  Dynamic: home-page
21
21
  Dynamic: license
22
+ Dynamic: license-file
22
23
  Dynamic: requires-python
23
24
  Dynamic: summary
24
25
 
@@ -49,6 +50,8 @@ beautifulsoup4==4.13.3
49
50
  cloudscraper==1.2.71
50
51
  pandas==2.2.3
51
52
  pytz==2025.1
53
+ seleniumbase==4.38.0
54
+ requests==2.32.3
52
55
  ```
53
56
 
54
57
  ## Installation
@@ -25,8 +25,8 @@ class InstaPeriodScraper(object):
25
25
  )
26
26
 
27
27
  def check_account_is_public(self):
28
- init_response = self.pixwox_request.send_requests(url=self.scraper.init_url)
29
- self.profile_soup = self.parser.get_soup(response=init_response)
28
+ self.init_response = self.pixwox_request.send_requests(url=self.scraper.init_url)
29
+ self.profile_soup = self.parser.get_soup(response=self.init_response)
30
30
  self.userid = self.parser.get_userid(profile_soup=self.profile_soup)
31
31
  self.account_status = get_account_status(userid=self.userid, profile_soup=self.profile_soup)
32
32
  return self.account_status == "public"
@@ -227,6 +227,12 @@ class InstaPeriodScraper(object):
227
227
  username = self.target_info["username"]
228
228
  self.scraper.set_username(username)
229
229
  days_limit = target_info["days_limit"]
230
+
231
+ # check if user-agent & cookies are valid first
232
+ print("# check if user-agent & cookies are valid first")
233
+ valid_headers_cookies = get_valid_headers_cookies(username=username)
234
+ self.pixwox_request.set_valid_headers_cookies(valid_headers_cookies=valid_headers_cookies)
235
+
230
236
  if not self.check_account_is_public():
231
237
  print("This is private account")
232
238
  if self.account_status == "private":
@@ -238,9 +244,21 @@ class InstaPeriodScraper(object):
238
244
  return res
239
245
 
240
246
  if self.check_account_is_public():
241
- init_api_data = self.get_init_api_data() # 帳號資訊 & 上方頁面內容
242
- self.get_profile()
247
+ self.scraper_utils = get_scraper_utils(html=self.init_response.text) # new
243
248
  print(f"This is public account")
249
+ # get_scraper_utils
250
+ init_response = self.pixwox_request.send_requests(url=self.scraper.init_url)
251
+
252
+
253
+ # init_api_data = self.get_init_api_data() # 帳號資訊 & 上方頁面內容
254
+ userid = self.scraper_utils["userid"]
255
+ username = self.scraper_utils["username"]
256
+ next_maxid = self.scraper_utils["data_maxid"]
257
+ next_ = self.scraper_utils["clean_data_next"]
258
+ next_api = f"https://www.piokok.com/api/posts?username={username}&userid={userid}&next={next_}==&maxid={next_maxid}"
259
+ init_api_data = self.pixwox_request.send_requests(url=next_api) # actually, this is next..
260
+ init_api_data = init_api_data.json()
261
+ self.get_profile()
244
262
  # can scrape next round's posts
245
263
  if init_api_data["posts"]["has_next"] != False:
246
264
  maxid = init_api_data["posts"]["maxid"]
@@ -256,8 +274,12 @@ class InstaPeriodScraper(object):
256
274
  scraped_posts=period_posts,
257
275
  init_api_data=init_api_data
258
276
  )
277
+ init_posts = self.parser.extract_init_posts(init_response.text)
278
+ res["init_posts"] = init_posts
259
279
  return res
260
280
  # # no more posts
261
281
  elif init_api_data["posts"]["has_next"] == False: # (表示該帳號貼文數<=12, 無法繼續往下找)
262
282
  res = self.get_public_account_res(scraped_posts=init_api_data["posts"]["items"], init_api_data=init_api_data)
283
+ init_posts = self.parser.extract_init_posts(init_response.text)
284
+ res["init_posts"] = init_posts
263
285
  return res
@@ -38,6 +38,47 @@ class Parser(object):
38
38
  if each_info.text != ' ':
39
39
  user_info_list.append(str(each_info.text))
40
40
  return user_info_list
41
+
42
+ @staticmethod
43
+ def extract_init_posts(html):
44
+ """
45
+ Extracts post details (text, likes, comments, time) from HTML content.
46
+
47
+ Args:
48
+ html (str): The full HTML content of the page.
49
+
50
+ Returns:
51
+ list[dict]: A list of dictionaries containing post information.
52
+ """
53
+ soup = BeautifulSoup(html, "html.parser")
54
+
55
+ # Find all post items under .posts > .items > .item
56
+ items = soup.select("div.posts div.items div.item")
57
+
58
+ init_posts = []
59
+
60
+ for item in items:
61
+ post = {}
62
+
63
+ # 1. Post content text
64
+ sum_div = item.select_one("div.sum")
65
+ post["text"] = sum_div.text.strip() if sum_div else None
66
+
67
+ # 2. Like count
68
+ like_span = item.select_one("span.count_item_like span.num")
69
+ post["likes"] = like_span.text.strip() if like_span else None
70
+
71
+ # 3. Comment count
72
+ comment_span = item.select_one("span.count_item_comment span.num")
73
+ post["comments"] = comment_span.text.strip() if comment_span else None
74
+
75
+ # 4. Post date/time
76
+ time_span = item.select_one("div.time span.txt")
77
+ post["time"] = time_span.text.strip() if time_span else None
78
+
79
+ init_posts.append(post)
80
+
81
+ return init_posts
41
82
 
42
83
  def get_soup(self, response):
43
84
  soup = BeautifulSoup(response.text, self.__DEFAULT_SOUP_PARSER)
@@ -2,6 +2,7 @@
2
2
  import json
3
3
  import cloudscraper
4
4
  from bs4 import BeautifulSoup
5
+ import requests
5
6
 
6
7
 
7
8
  class PixwoxRequest(object):
@@ -14,8 +15,18 @@ class PixwoxRequest(object):
14
15
  "mobile": "False"})
15
16
 
16
17
  def send_requests(self, url):
17
- response = self.__scraper.get(url)
18
+ # response = self.__scraper.get(url) # temporary stop this good method :(
19
+ response = requests.get(
20
+ url=url,
21
+ headers={"User-Agent":self.__user_agent},
22
+ cookies=self.__cookies
23
+ )
18
24
  return response
25
+
26
+ def set_valid_headers_cookies(self, valid_headers_cookies):
27
+ self.__valid_headers_cookies = valid_headers_cookies
28
+ self.__user_agent = self.__valid_headers_cookies[0].get("User-Agent")
29
+ self.__cookies = self.__valid_headers_cookies[1]
19
30
 
20
31
  def get_init_content(self, username: str) -> str:
21
32
  get_url = f"https://www.picnob.com/zh-hant/profile/{username}"
@@ -0,0 +1,145 @@
1
+ # -*- coding: utf-8 -*-
2
+ import concurrent.futures as futures
3
+ from datetime import datetime
4
+ import pytz
5
+ import pandas as pd
6
+ from functools import wraps
7
+ import time
8
+ import os
9
+ import json
10
+ import requests
11
+ from seleniumbase import Driver
12
+ from pathlib import Path
13
+ from bs4 import BeautifulSoup
14
+
15
+
16
+ def timeit(func):
17
+ @wraps(func)
18
+ def timeit_wrapper(*args, **kwargs):
19
+ start_time = time.perf_counter()
20
+ result = func(*args, **kwargs)
21
+ end_time = time.perf_counter()
22
+ total_time = end_time - start_time
23
+ print(f'Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds')
24
+ return result
25
+ return timeit_wrapper
26
+
27
+ def timeout(timelimit):
28
+ def decorator(func):
29
+ def decorated(*args, **kwargs):
30
+ with futures.ThreadPoolExecutor(max_workers=1) as executor:
31
+ future = executor.submit(func, *args, **kwargs)
32
+ try:
33
+ result = future.result(timelimit)
34
+ except futures.TimeoutError:
35
+ print('Time out!')
36
+ raise TimeoutError from None
37
+ else:
38
+ print(result)
39
+ executor._threads.clear()
40
+ futures.thread._threads_queues.clear()
41
+ return result
42
+ return decorated
43
+ return decorator
44
+
45
+ def get_current_time(timezone="Asia/Taipei"):
46
+ current_time_utc = datetime.utcnow()
47
+ target_timezone = pytz.timezone(timezone)
48
+ target_current_time = current_time_utc.replace(
49
+ tzinfo=pytz.utc).astimezone(target_timezone)
50
+ return target_current_time
51
+
52
+ def get_account_status(userid, profile_soup=None):
53
+ if userid == "":
54
+ return "missing"
55
+ else:
56
+ private_span = profile_soup.find(
57
+ "span", class_="ident private icon icon_lock")
58
+ if private_span:
59
+ return "private"
60
+ return "public"
61
+
62
+ def has_all_data_been_collected(scraped_items:pd.DataFrame,counts_of_posts):
63
+ """Whether program get all posts already."""
64
+ if len(set([each["shortcode"] for each in scraped_items])) >= int(counts_of_posts):
65
+ return True
66
+ return False
67
+
68
+ def is_date_exceed_half_year(scraped_items:pd.DataFrame, days_limit:int):
69
+ """Check if scraped posts' published date exceed half year"""
70
+ current_time = datetime.now()
71
+ days_ago_list = [int(
72
+ (current_time - pd.to_datetime(each["time"], unit="s")).days) for each in scraped_items]
73
+
74
+ max_days_ago = max(days_ago_list) # 爬到的貼文裡, 發文時間距離當前時間最遠的日期
75
+ if max_days_ago > days_limit: # 半年內
76
+ return True
77
+ return False
78
+
79
+ def get_valid_headers_cookies(username: str):
80
+ # 1. Get the main script directory (not the utils directory)
81
+ main_dir = Path(__file__).resolve().parent.parent # one level above utils
82
+ json_dir = main_dir / "auth_data" # directory to store headers/cookies
83
+ json_dir.mkdir(exist_ok=True) # create directory if it doesn't exist
84
+ json_path = json_dir / f"instagram_posts_scraper_headers.json" # path to the JSON file
85
+ url = f"https://www.pixnoy.com/profile/{username}"
86
+
87
+ # 2. Use Selenium to bypass Cloudflare and save headers/cookies
88
+ def crawl_and_save():
89
+ print("⚠️ Launching Selenium to bypass Cloudflare...")
90
+ driver = Driver(uc=True, headless=True)
91
+ driver.uc_open_with_reconnect(url)
92
+ time.sleep(10)
93
+
94
+ cookies = {c['name']: c['value'] for c in driver.get_cookies()}
95
+ user_agent = driver.execute_script("return navigator.userAgent;")
96
+ headers = {"User-Agent": user_agent}
97
+
98
+ # save json
99
+ with open(json_path, "w") as f:
100
+ json.dump({"headers": headers, "cookies": cookies}, f, indent=2)
101
+
102
+ driver.quit()
103
+ print("Already update headers and cookies")
104
+ return headers, cookies
105
+
106
+ # 2. check json 是否存在
107
+ if json_path.exists():
108
+ with open(json_path, "r") as f:
109
+ try:
110
+ data = json.load(f)
111
+ headers = data["headers"]
112
+ cookies = data["cookies"]
113
+
114
+ print("Attempting to use cached headers and cookies...")
115
+ resp = requests.get(url, headers=headers, cookies=cookies)
116
+ if resp.status_code == 200:
117
+ print("Cache is valid. Using cached data.")
118
+ return headers, cookies
119
+ else:
120
+ print(f"Cache is invalid. Status code: {resp.status_code}. Fetching new data...")
121
+ return crawl_and_save()
122
+
123
+ except Exception as e:
124
+ print("Failed to read JSON file. Re-fetching headers and cookies.")
125
+ return crawl_and_save()
126
+ else:
127
+ return crawl_and_save()
128
+
129
+ def get_scraper_utils(html:str):
130
+ soup = BeautifulSoup(html, 'html.parser')
131
+ userid = soup.find('input', {'name': 'userid'})['value']
132
+ username = soup.find('input', {'name': 'username'})['value']
133
+ more_btns = soup.select('a.more_btn') # find all a.more_btn
134
+ for btn in more_btns: # Filter data-next (exists value)
135
+ data_next = btn.get('data-next')
136
+ if data_next:
137
+ clean_data_next = data_next.rstrip('=')
138
+ data_maxid = btn.get('data-maxid')
139
+ break
140
+ return {
141
+ "userid":userid,
142
+ "username":username,
143
+ "clean_data_next":clean_data_next,
144
+ "data_maxid":data_maxid
145
+ }
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: instagram-posts-scraper
3
- Version: 0.0.2
3
+ Version: 0.0.3
4
4
  Summary: Implement Instagram Posts Scraper for post data retrieval
5
5
  Home-page: https://github.com/FaustRen/instagram-posts-scraper
6
6
  Author: FaustRen
@@ -19,6 +19,7 @@ Dynamic: description
19
19
  Dynamic: description-content-type
20
20
  Dynamic: home-page
21
21
  Dynamic: license
22
+ Dynamic: license-file
22
23
  Dynamic: requires-python
23
24
  Dynamic: summary
24
25
 
@@ -49,6 +50,8 @@ beautifulsoup4==4.13.3
49
50
  cloudscraper==1.2.71
50
51
  pandas==2.2.3
51
52
  pytz==2025.1
53
+ seleniumbase==4.38.0
54
+ requests==2.32.3
52
55
  ```
53
56
 
54
57
  ## Installation
@@ -4,7 +4,7 @@ from setuptools import setup, find_packages
4
4
 
5
5
  setup(
6
6
  name='instagram-posts-scraper',
7
- version='0.0.2',
7
+ version='0.0.3',
8
8
  packages=[
9
9
  "instagram_posts_scraper",
10
10
  "instagram_posts_scraper.utils"
@@ -1,71 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- import concurrent.futures as futures
3
- from datetime import datetime
4
- import pytz
5
- import pandas as pd
6
- from functools import wraps
7
- import time
8
-
9
-
10
- def timeit(func):
11
- @wraps(func)
12
- def timeit_wrapper(*args, **kwargs):
13
- start_time = time.perf_counter()
14
- result = func(*args, **kwargs)
15
- end_time = time.perf_counter()
16
- total_time = end_time - start_time
17
- print(f'Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds')
18
- return result
19
- return timeit_wrapper
20
-
21
- def timeout(timelimit):
22
- def decorator(func):
23
- def decorated(*args, **kwargs):
24
- with futures.ThreadPoolExecutor(max_workers=1) as executor:
25
- future = executor.submit(func, *args, **kwargs)
26
- try:
27
- result = future.result(timelimit)
28
- except futures.TimeoutError:
29
- print('Time out!')
30
- raise TimeoutError from None
31
- else:
32
- print(result)
33
- executor._threads.clear()
34
- futures.thread._threads_queues.clear()
35
- return result
36
- return decorated
37
- return decorator
38
-
39
- def get_current_time(timezone="Asia/Taipei"):
40
- current_time_utc = datetime.utcnow()
41
- target_timezone = pytz.timezone(timezone)
42
- target_current_time = current_time_utc.replace(
43
- tzinfo=pytz.utc).astimezone(target_timezone)
44
- return target_current_time
45
-
46
- def get_account_status(userid, profile_soup=None):
47
- if userid == "":
48
- return "missing"
49
- else:
50
- private_span = profile_soup.find(
51
- "span", class_="ident private icon icon_lock")
52
- if private_span:
53
- return "private"
54
- return "public"
55
-
56
- def has_all_data_been_collected(scraped_items:pd.DataFrame,counts_of_posts):
57
- """Whether program get all posts already."""
58
- if len(set([each["shortcode"] for each in scraped_items])) >= int(counts_of_posts):
59
- return True
60
- return False
61
-
62
- def is_date_exceed_half_year(scraped_items:pd.DataFrame, days_limit:int):
63
- """Check if scraped posts' published date exceed half year"""
64
- current_time = datetime.now()
65
- days_ago_list = [int(
66
- (current_time - pd.to_datetime(each["time"], unit="s")).days) for each in scraped_items]
67
-
68
- max_days_ago = max(days_ago_list) # 爬到的貼文裡, 發文時間距離當前時間最遠的日期
69
- if max_days_ago > days_limit: # 半年內
70
- return True
71
- return False