instagram-posts-scraper 0.0.2__tar.gz → 0.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/PKG-INFO +5 -2
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/instagram_posts_scraper.py +26 -4
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/parse.py +41 -0
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/request.py +12 -1
- instagram_posts_scraper-0.0.3/instagram_posts_scraper/utils/utils.py +145 -0
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper.egg-info/PKG-INFO +5 -2
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/setup.py +1 -1
- instagram_posts_scraper-0.0.2/instagram_posts_scraper/utils/utils.py +0 -71
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/LICENSE +0 -0
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/__init__.py +0 -0
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/example.py +0 -0
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/file_operation.py +0 -0
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/scraper.py +0 -0
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/utils/__init__.py +0 -0
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/utils.py +0 -0
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper.egg-info/SOURCES.txt +0 -0
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper.egg-info/dependency_links.txt +0 -0
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper.egg-info/top_level.txt +0 -0
- {instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: instagram-posts-scraper
|
|
3
|
-
Version: 0.0.2
|
|
3
|
+
Version: 0.0.3
|
|
4
4
|
Summary: Implement Instagram Posts Scraper for post data retrieval
|
|
5
5
|
Home-page: https://github.com/FaustRen/instagram-posts-scraper
|
|
6
6
|
Author: FaustRen
|
|
@@ -19,6 +19,7 @@ Dynamic: description
|
|
|
19
19
|
Dynamic: description-content-type
|
|
20
20
|
Dynamic: home-page
|
|
21
21
|
Dynamic: license
|
|
22
|
+
Dynamic: license-file
|
|
22
23
|
Dynamic: requires-python
|
|
23
24
|
Dynamic: summary
|
|
24
25
|
|
|
@@ -49,6 +50,8 @@ beautifulsoup4==4.13.3
|
|
|
49
50
|
cloudscraper==1.2.71
|
|
50
51
|
pandas==2.2.3
|
|
51
52
|
pytz==2025.1
|
|
53
|
+
seleniumbase==4.38.0
|
|
54
|
+
requests==2.32.3
|
|
52
55
|
```
|
|
53
56
|
|
|
54
57
|
## Installation
|
|
@@ -25,8 +25,8 @@ class InstaPeriodScraper(object):
|
|
|
25
25
|
)
|
|
26
26
|
|
|
27
27
|
def check_account_is_public(self):
|
|
28
|
-
init_response = self.pixwox_request.send_requests(url=self.scraper.init_url)
|
|
29
|
-
self.profile_soup = self.parser.get_soup(response=init_response)
|
|
28
|
+
self.init_response = self.pixwox_request.send_requests(url=self.scraper.init_url)
|
|
29
|
+
self.profile_soup = self.parser.get_soup(response=self.init_response)
|
|
30
30
|
self.userid = self.parser.get_userid(profile_soup=self.profile_soup)
|
|
31
31
|
self.account_status = get_account_status(userid=self.userid, profile_soup=self.profile_soup)
|
|
32
32
|
return self.account_status == "public"
|
|
@@ -227,6 +227,12 @@ class InstaPeriodScraper(object):
|
|
|
227
227
|
username = self.target_info["username"]
|
|
228
228
|
self.scraper.set_username(username)
|
|
229
229
|
days_limit = target_info["days_limit"]
|
|
230
|
+
|
|
231
|
+
# check if user-agent & cookies are valid first
|
|
232
|
+
print("# check if user-agent & cookies are valid first")
|
|
233
|
+
valid_headers_cookies = get_valid_headers_cookies(username=username)
|
|
234
|
+
self.pixwox_request.set_valid_headers_cookies(valid_headers_cookies=valid_headers_cookies)
|
|
235
|
+
|
|
230
236
|
if not self.check_account_is_public():
|
|
231
237
|
print("This is private account")
|
|
232
238
|
if self.account_status == "private":
|
|
@@ -238,9 +244,21 @@ class InstaPeriodScraper(object):
|
|
|
238
244
|
return res
|
|
239
245
|
|
|
240
246
|
if self.check_account_is_public():
|
|
241
|
-
|
|
242
|
-
self.get_profile()
|
|
247
|
+
self.scraper_utils = get_scraper_utils(html=self.init_response.text) # new
|
|
243
248
|
print(f"This is public account")
|
|
249
|
+
# get_scraper_utils
|
|
250
|
+
init_response = self.pixwox_request.send_requests(url=self.scraper.init_url)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# init_api_data = self.get_init_api_data() # 帳號資訊 & 上方頁面內容
|
|
254
|
+
userid = self.scraper_utils["userid"]
|
|
255
|
+
username = self.scraper_utils["username"]
|
|
256
|
+
next_maxid = self.scraper_utils["data_maxid"]
|
|
257
|
+
next_ = self.scraper_utils["clean_data_next"]
|
|
258
|
+
next_api = f"https://www.piokok.com/api/posts?username={username}&userid={userid}&next={next_}==&maxid={next_maxid}"
|
|
259
|
+
init_api_data = self.pixwox_request.send_requests(url=next_api) # actually, this is next..
|
|
260
|
+
init_api_data = init_api_data.json()
|
|
261
|
+
self.get_profile()
|
|
244
262
|
# can scrape next round's posts
|
|
245
263
|
if init_api_data["posts"]["has_next"] != False:
|
|
246
264
|
maxid = init_api_data["posts"]["maxid"]
|
|
@@ -256,8 +274,12 @@ class InstaPeriodScraper(object):
|
|
|
256
274
|
scraped_posts=period_posts,
|
|
257
275
|
init_api_data=init_api_data
|
|
258
276
|
)
|
|
277
|
+
init_posts = self.parser.extract_init_posts(init_response.text)
|
|
278
|
+
res["init_posts"] = init_posts
|
|
259
279
|
return res
|
|
260
280
|
# # no more posts
|
|
261
281
|
elif init_api_data["posts"]["has_next"] == False: # (表示該帳號貼文數<=12, 無法繼續往下找)
|
|
262
282
|
res = self.get_public_account_res(scraped_posts=init_api_data["posts"]["items"], init_api_data=init_api_data)
|
|
283
|
+
init_posts = self.parser.extract_init_posts(init_response.text)
|
|
284
|
+
res["init_posts"] = init_posts
|
|
263
285
|
return res
|
{instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/parse.py
RENAMED
|
@@ -38,6 +38,47 @@ class Parser(object):
|
|
|
38
38
|
if each_info.text != ' ':
|
|
39
39
|
user_info_list.append(str(each_info.text))
|
|
40
40
|
return user_info_list
|
|
41
|
+
|
|
42
|
+
@staticmethod
def extract_init_posts(html):
    """
    Collect the posts rendered directly in a profile page's HTML.

    Args:
        html (str): The full HTML content of the page.

    Returns:
        list[dict]: One dictionary per ``div.posts div.items div.item``
        element, with the keys ``text``, ``likes``, ``comments`` and
        ``time``. Each value is the stripped text of the matching element,
        or ``None`` when that element is absent from the item.
    """
    # Output key -> CSS selector, evaluated relative to a single post item.
    field_selectors = (
        ("text", "div.sum"),
        ("likes", "span.count_item_like span.num"),
        ("comments", "span.count_item_comment span.num"),
        ("time", "div.time span.txt"),
    )

    page = BeautifulSoup(html, "html.parser")
    init_posts = []
    for entry in page.select("div.posts div.items div.item"):
        record = {}
        for key, selector in field_selectors:
            node = entry.select_one(selector)
            record[key] = node.text.strip() if node else None
        init_posts.append(record)
    return init_posts
|
|
41
82
|
|
|
42
83
|
def get_soup(self, response):
|
|
43
84
|
soup = BeautifulSoup(response.text, self.__DEFAULT_SOUP_PARSER)
|
{instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/request.py
RENAMED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
import json
|
|
3
3
|
import cloudscraper
|
|
4
4
|
from bs4 import BeautifulSoup
|
|
5
|
+
import requests
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
class PixwoxRequest(object):
|
|
@@ -14,8 +15,18 @@ class PixwoxRequest(object):
|
|
|
14
15
|
"mobile": "False"})
|
|
15
16
|
|
|
16
17
|
def send_requests(self, url):
|
|
17
|
-
response = self.__scraper.get(url)
|
|
18
|
+
# response = self.__scraper.get(url) # temporary stop this good method :(
|
|
19
|
+
response = requests.get(
|
|
20
|
+
url=url,
|
|
21
|
+
headers={"User-Agent":self.__user_agent},
|
|
22
|
+
cookies=self.__cookies
|
|
23
|
+
)
|
|
18
24
|
return response
|
|
25
|
+
|
|
26
|
+
def set_valid_headers_cookies(self, valid_headers_cookies):
|
|
27
|
+
self.__valid_headers_cookies = valid_headers_cookies
|
|
28
|
+
self.__user_agent = self.__valid_headers_cookies[0].get("User-Agent")
|
|
29
|
+
self.__cookies = self.__valid_headers_cookies[1]
|
|
19
30
|
|
|
20
31
|
def get_init_content(self, username: str) -> str:
|
|
21
32
|
get_url = f"https://www.picnob.com/zh-hant/profile/{username}"
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import concurrent.futures as futures
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
import pytz
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from functools import wraps
|
|
7
|
+
import time
|
|
8
|
+
import os
|
|
9
|
+
import json
|
|
10
|
+
import requests
|
|
11
|
+
from seleniumbase import Driver
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from bs4 import BeautifulSoup
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def timeit(func):
    """Decorate ``func`` so that every call prints how long it took.

    The wrapper forwards all positional and keyword arguments, returns the
    wrapped function's result unchanged, and prints one line containing the
    function name, the arguments, and the elapsed time measured with
    ``time.perf_counter``.
    """
    @wraps(func)
    def timed_call(*args, **kwargs):
        began = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - began
        print(f'Function {func.__name__}{args} {kwargs} Took {elapsed:.4f} seconds')
        return outcome

    return timed_call
|
|
26
|
+
|
|
27
|
+
def timeout(timelimit):
    """Build a decorator that stops waiting for a call after ``timelimit`` seconds.

    The decorated function runs in a single worker thread. If it finishes in
    time, its result is printed and returned. If it does not, ``'Time out!'``
    is printed and the built-in ``TimeoutError`` is raised.

    Args:
        timelimit: Maximum number of seconds to wait for the result.

    Raises:
        TimeoutError: When the call does not finish within ``timelimit``.

    Note:
        A Python thread cannot be killed from outside, so on timeout the
        worker keeps running in the background until the function returns;
        only the caller stops waiting.
    """
    def decorator(func):
        @wraps(func)
        def decorated(*args, **kwargs):
            # Deliberately not a ``with`` block: leaving one calls
            # ``shutdown(wait=True)``, which blocks until the worker is done
            # and would make the time limit ineffective.
            executor = futures.ThreadPoolExecutor(max_workers=1)
            try:
                future = executor.submit(func, *args, **kwargs)
                try:
                    result = future.result(timelimit)
                except futures.TimeoutError:
                    print('Time out!')
                    raise TimeoutError from None
            finally:
                # Release the pool without waiting for a still-running worker.
                executor.shutdown(wait=False)
            print(result)
            return result
        return decorated
    return decorator
|
|
44
|
+
|
|
45
|
+
def get_current_time(timezone="Asia/Taipei"):
    """Return the current time as a timezone-aware ``datetime``.

    Args:
        timezone: Name of the target timezone, passed to ``pytz.timezone``.

    Returns:
        The current instant expressed in the requested timezone.
    """
    target_timezone = pytz.timezone(timezone)
    # datetime.utcnow() is deprecated since Python 3.12 and returns a naive
    # value; take an aware UTC timestamp directly and convert it instead.
    return datetime.now(pytz.utc).astimezone(target_timezone)
|
|
51
|
+
|
|
52
|
+
def get_account_status(userid, profile_soup=None):
    """Classify an account as ``"missing"``, ``"private"`` or ``"public"``.

    Args:
        userid: User id extracted from the profile page. An empty string or
            ``None`` means no account was found.
        profile_soup: Parsed profile page exposing a BeautifulSoup-style
            ``find`` method. Only needed when ``userid`` is not empty.

    Returns:
        ``"missing"`` when there is no user id, ``"private"`` when the page
        contains the lock-icon span, otherwise ``"public"``.

    Raises:
        ValueError: If ``userid`` is set but ``profile_soup`` was not given.
    """
    if not userid:
        return "missing"
    if profile_soup is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("profile_soup is required when userid is not empty")

    private_span = profile_soup.find(
        "span", class_="ident private icon icon_lock")
    return "private" if private_span else "public"
|
|
61
|
+
|
|
62
|
+
def has_all_data_been_collected(scraped_items, counts_of_posts):
    """Tell whether every post of the account has already been scraped.

    Args:
        scraped_items: Iterable of post records, each indexable by the key
            ``"shortcode"`` (for example a list of dicts). This is not a
            DataFrame: iterating a DataFrame yields column labels, which
            cannot be indexed by ``"shortcode"``.
        counts_of_posts: Total number of posts of the account; anything
            accepted by ``int()``.

    Returns:
        ``True`` when the number of distinct shortcodes is at least
        ``counts_of_posts``, otherwise ``False``.
    """
    # Deduplicate by shortcode so a post scraped twice is counted once.
    unique_shortcodes = {each["shortcode"] for each in scraped_items}
    return len(unique_shortcodes) >= int(counts_of_posts)
|
|
67
|
+
|
|
68
|
+
def is_date_exceed_half_year(scraped_items, days_limit: int):
    """Check whether any scraped post is older than ``days_limit`` days.

    Args:
        scraped_items: Iterable of post records, each indexable by the key
            ``"time"`` holding an epoch timestamp in seconds (it is parsed
            with ``pd.to_datetime(..., unit="s")``).
        days_limit: Age threshold in whole days.

    Returns:
        ``True`` when the oldest post is more than ``days_limit`` days old.
        ``False`` otherwise, including when ``scraped_items`` is empty.
    """
    # Epoch seconds are UTC-based, so compare against "now" in UTC as well
    # rather than against the machine's local naive clock.
    now_utc = pd.Timestamp.now(tz="UTC")
    days_ago_list = [
        (now_utc - pd.to_datetime(each["time"], unit="s", utc=True)).days
        for each in scraped_items
    ]

    if not days_ago_list:
        # Nothing scraped: no post can exceed the limit (max() would raise).
        return False

    # Age of the post published furthest from the current time.
    return max(days_ago_list) > days_limit
|
|
78
|
+
|
|
79
|
+
def get_valid_headers_cookies(username: str):
    """Return request headers and cookies that the profile site accepts.

    A JSON cache file in the ``auth_data`` directory, one level above this
    module's directory, is tried first. It is used when a GET of the profile
    URL with the cached values answers with status 200. Otherwise a headless
    Selenium browser opens the page, its cookies and user agent are read,
    written to the cache file, and returned.

    Args:
        username: Account name used to build the profile URL.

    Returns:
        tuple: ``(headers, cookies)`` where ``headers`` is
        ``{"User-Agent": ...}`` and ``cookies`` maps cookie names to values.
    """
    # 1. Locate the cache file (main package directory, not the utils directory).
    main_dir = Path(__file__).resolve().parent.parent  # one level above utils
    json_dir = main_dir / "auth_data"  # directory to store headers/cookies
    json_dir.mkdir(exist_ok=True)  # create directory if it doesn't exist
    json_path = json_dir / "instagram_posts_scraper_headers.json"
    url = f"https://www.pixnoy.com/profile/{username}"

    # 2. Use Selenium to bypass Cloudflare and save headers/cookies.
    def crawl_and_save():
        print("⚠️ Launching Selenium to bypass Cloudflare...")
        driver = Driver(uc=True, headless=True)
        try:
            driver.uc_open_with_reconnect(url)
            time.sleep(10)

            cookies = {c['name']: c['value'] for c in driver.get_cookies()}
            user_agent = driver.execute_script("return navigator.userAgent;")
        finally:
            # Always close the browser, even when the page load or the
            # script call fails, so no browser process is left behind.
            driver.quit()
        headers = {"User-Agent": user_agent}

        # Save the fresh values for the next run.
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump({"headers": headers, "cookies": cookies}, f, indent=2)

        print("Already update headers and cookies")
        return headers, cookies

    # 3. Without a cache file there is nothing to validate.
    if not json_path.exists():
        return crawl_and_save()

    # Only the cache read is guarded here. A failure inside crawl_and_save()
    # must propagate instead of being caught and retried a second time.
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        headers = data["headers"]
        cookies = data["cookies"]
    except (OSError, ValueError, KeyError, TypeError):
        print("Failed to read JSON file. Re-fetching headers and cookies.")
        return crawl_and_save()

    print("Attempting to use cached headers and cookies...")
    try:
        # A timeout keeps a stalled connection from hanging the scraper.
        resp = requests.get(url, headers=headers, cookies=cookies, timeout=30)
    except requests.RequestException as exc:
        print(f"Cache validation request failed: {exc}. Fetching new data...")
        return crawl_and_save()

    if resp.status_code == 200:
        print("Cache is valid. Using cached data.")
        return headers, cookies

    print(f"Cache is invalid. Status code: {resp.status_code}. Fetching new data...")
    return crawl_and_save()
|
|
128
|
+
|
|
129
|
+
def get_scraper_utils(html: str):
    """Extract the values needed to request the next page of posts.

    Args:
        html: HTML of the profile page.

    Returns:
        dict: ``userid`` and ``username`` taken from the ``<input>`` elements
        with those names, plus ``clean_data_next`` (the ``data-next``
        attribute with trailing ``=`` removed) and ``data_maxid`` (the
        ``data-maxid`` attribute) of the first ``a.more_btn`` element that
        has a non-empty ``data-next``. Both are ``None`` when no such
        element exists.

    Raises:
        ValueError: If the ``userid`` or ``username`` input, or its
            ``value`` attribute, is missing from the page.
    """
    soup = BeautifulSoup(html, 'html.parser')

    def input_value(name):
        # Report a missing field clearly instead of failing on None['value'].
        field = soup.find('input', {'name': name})
        if field is None or not field.has_attr('value'):
            raise ValueError(f"Profile page has no <input name={name!r}> value")
        return field['value']

    userid = input_value('userid')
    username = input_value('username')

    # Defaults keep the return statement from hitting unbound names when
    # the page has no "more" button carrying pagination data.
    clean_data_next = None
    data_maxid = None
    for btn in soup.select('a.more_btn'):
        data_next = btn.get('data-next')
        if data_next:  # only a button with a non-empty data-next qualifies
            clean_data_next = data_next.rstrip('=')
            data_maxid = btn.get('data-maxid')
            break

    return {
        "userid": userid,
        "username": username,
        "clean_data_next": clean_data_next,
        "data_maxid": data_maxid,
    }
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: instagram-posts-scraper
|
|
3
|
-
Version: 0.0.2
|
|
3
|
+
Version: 0.0.3
|
|
4
4
|
Summary: Implement Instagram Posts Scraper for post data retrieval
|
|
5
5
|
Home-page: https://github.com/FaustRen/instagram-posts-scraper
|
|
6
6
|
Author: FaustRen
|
|
@@ -19,6 +19,7 @@ Dynamic: description
|
|
|
19
19
|
Dynamic: description-content-type
|
|
20
20
|
Dynamic: home-page
|
|
21
21
|
Dynamic: license
|
|
22
|
+
Dynamic: license-file
|
|
22
23
|
Dynamic: requires-python
|
|
23
24
|
Dynamic: summary
|
|
24
25
|
|
|
@@ -49,6 +50,8 @@ beautifulsoup4==4.13.3
|
|
|
49
50
|
cloudscraper==1.2.71
|
|
50
51
|
pandas==2.2.3
|
|
51
52
|
pytz==2025.1
|
|
53
|
+
seleniumbase==4.38.0
|
|
54
|
+
requests==2.32.3
|
|
52
55
|
```
|
|
53
56
|
|
|
54
57
|
## Installation
|
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
import concurrent.futures as futures
|
|
3
|
-
from datetime import datetime
|
|
4
|
-
import pytz
|
|
5
|
-
import pandas as pd
|
|
6
|
-
from functools import wraps
|
|
7
|
-
import time
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def timeit(func):
|
|
11
|
-
@wraps(func)
|
|
12
|
-
def timeit_wrapper(*args, **kwargs):
|
|
13
|
-
start_time = time.perf_counter()
|
|
14
|
-
result = func(*args, **kwargs)
|
|
15
|
-
end_time = time.perf_counter()
|
|
16
|
-
total_time = end_time - start_time
|
|
17
|
-
print(f'Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds')
|
|
18
|
-
return result
|
|
19
|
-
return timeit_wrapper
|
|
20
|
-
|
|
21
|
-
def timeout(timelimit):
|
|
22
|
-
def decorator(func):
|
|
23
|
-
def decorated(*args, **kwargs):
|
|
24
|
-
with futures.ThreadPoolExecutor(max_workers=1) as executor:
|
|
25
|
-
future = executor.submit(func, *args, **kwargs)
|
|
26
|
-
try:
|
|
27
|
-
result = future.result(timelimit)
|
|
28
|
-
except futures.TimeoutError:
|
|
29
|
-
print('Time out!')
|
|
30
|
-
raise TimeoutError from None
|
|
31
|
-
else:
|
|
32
|
-
print(result)
|
|
33
|
-
executor._threads.clear()
|
|
34
|
-
futures.thread._threads_queues.clear()
|
|
35
|
-
return result
|
|
36
|
-
return decorated
|
|
37
|
-
return decorator
|
|
38
|
-
|
|
39
|
-
def get_current_time(timezone="Asia/Taipei"):
|
|
40
|
-
current_time_utc = datetime.utcnow()
|
|
41
|
-
target_timezone = pytz.timezone(timezone)
|
|
42
|
-
target_current_time = current_time_utc.replace(
|
|
43
|
-
tzinfo=pytz.utc).astimezone(target_timezone)
|
|
44
|
-
return target_current_time
|
|
45
|
-
|
|
46
|
-
def get_account_status(userid, profile_soup=None):
|
|
47
|
-
if userid == "":
|
|
48
|
-
return "missing"
|
|
49
|
-
else:
|
|
50
|
-
private_span = profile_soup.find(
|
|
51
|
-
"span", class_="ident private icon icon_lock")
|
|
52
|
-
if private_span:
|
|
53
|
-
return "private"
|
|
54
|
-
return "public"
|
|
55
|
-
|
|
56
|
-
def has_all_data_been_collected(scraped_items:pd.DataFrame,counts_of_posts):
|
|
57
|
-
"""Whether program get all posts already."""
|
|
58
|
-
if len(set([each["shortcode"] for each in scraped_items])) >= int(counts_of_posts):
|
|
59
|
-
return True
|
|
60
|
-
return False
|
|
61
|
-
|
|
62
|
-
def is_date_exceed_half_year(scraped_items:pd.DataFrame, days_limit:int):
|
|
63
|
-
"""Check if scraped posts' published date exceed half year"""
|
|
64
|
-
current_time = datetime.now()
|
|
65
|
-
days_ago_list = [int(
|
|
66
|
-
(current_time - pd.to_datetime(each["time"], unit="s")).days) for each in scraped_items]
|
|
67
|
-
|
|
68
|
-
max_days_ago = max(days_ago_list) # 爬到的貼文裡, 發文時間距離當前時間最遠的日期
|
|
69
|
-
if max_days_ago > days_limit: # 半年內
|
|
70
|
-
return True
|
|
71
|
-
return False
|
|
File without changes
|
{instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/__init__.py
RENAMED
|
File without changes
|
{instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/example.py
RENAMED
|
File without changes
|
|
File without changes
|
{instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/scraper.py
RENAMED
|
File without changes
|
|
File without changes
|
{instagram_posts_scraper-0.0.2 → instagram_posts_scraper-0.0.3}/instagram_posts_scraper/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|