instagram-posts-scraper 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- instagram_posts_scraper/__init__.py +0 -0
- instagram_posts_scraper/example.py +10 -0
- instagram_posts_scraper/file_operation.py +28 -0
- instagram_posts_scraper/instagram_posts_scraper.py +160 -0
- instagram_posts_scraper/parse.py +75 -0
- instagram_posts_scraper/request.py +42 -0
- instagram_posts_scraper/scraper.py +53 -0
- instagram_posts_scraper/utils/__init__.py +0 -0
- instagram_posts_scraper/utils/utils.py +71 -0
- instagram_posts_scraper/utils.py +40 -0
- instagram_posts_scraper-0.0.1.dist-info/LICENSE +19 -0
- instagram_posts_scraper-0.0.1.dist-info/METADATA +104 -0
- instagram_posts_scraper-0.0.1.dist-info/RECORD +15 -0
- instagram_posts_scraper-0.0.1.dist-info/WHEEL +5 -0
- instagram_posts_scraper-0.0.1.dist-info/top_level.txt +1 -0
|
File without changes
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
#%%
|
|
3
|
+
from instagram_posts_scraper.instagram_posts_scraper import InstaPeriodScraper
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
if __name__ == "__main__":
    # Run the demo only when this file is executed directly. The module is
    # shipped inside the package, so importing it must not start a network
    # scrape as a side effect.
    target_info = {"username": "joemanweng", "days_limit": 5}
    ig_posts_scraper = InstaPeriodScraper()
    res = ig_posts_scraper.get_posts(target_info=target_info)
    # A bare ``res`` expression shows nothing when run as a script.
    print(res)
|
|
10
|
+
# %%
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-.
|
|
2
|
+
import os
|
|
3
|
+
import json
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
class JsonOperation:
    """Read JSON files stored in the ``resources`` directory next to this module."""

    @staticmethod
    def _load_json_resource(resource_name):
        """Parse ``resources/<resource_name>`` as JSON and return the decoded value.

        Raises whatever ``open`` / ``json.load`` raise when the file is
        missing or is not valid JSON.
        """
        resource_path = os.path.join(
            os.path.dirname(__file__), "resources", resource_name)
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding.
        with open(resource_path, "r", encoding="utf-8") as file:
            return json.load(file)

    @staticmethod
    def read_config_content():
        """Return the decoded content of ``resources/config``."""
        return JsonOperation._load_json_resource("config")

    @staticmethod
    def read_start_point():
        """Return the decoded content of ``resources/start_point_config``."""
        return JsonOperation._load_json_resource("start_point_config")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class CsvOperation:
    """CSV helpers for files kept in the ``resources`` directory next to this module."""

    @staticmethod
    def read_kol_csv():
        """Load ``resources/new_input_scraper.csv`` with pandas and return the result of ``read_csv``."""
        module_dir = os.path.dirname(__file__)
        return pd.read_csv(module_dir + "/resources/new_input_scraper.csv")
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from instagram_posts_scraper.request import *
|
|
3
|
+
from instagram_posts_scraper.parse import *
|
|
4
|
+
from instagram_posts_scraper.utils import *
|
|
5
|
+
from instagram_posts_scraper.scraper import *
|
|
6
|
+
from instagram_posts_scraper.utils.utils import *
|
|
7
|
+
from instagram_posts_scraper.file_operation import *
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ScrapedDataManager(object):
    """Placeholder type: it currently defines no state and no behaviour."""

    def __init__(self):
        """Create an instance without setting any attributes."""
        return None
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class InstaPeriodScraper(object):
    """Collect an account's profile figures and recent posts.

    The class wires together a ``PixwoxRequest`` (HTTP), a ``Parser`` (profile
    page), an ``ApiParser`` and a ``Scraper`` (URL building / API calls).
    ``get_posts`` is the entry point; the other methods are the steps it runs
    and they communicate through attributes set on the instance
    (``userid``, ``profile_soup``, ``account_status``, ``followers``, ...).
    """

    def __init__(self) -> None:
        self.pixwox_request = PixwoxRequest()
        self.parser = Parser()
        self.api_parser = ApiParser()
        self.scraper = Scraper(
            pixwox_request=self.pixwox_request,
            parser=self.parser,
            api_parser=self.api_parser,
        )

    def check_account_is_public(self):
        """Fetch the profile page and report whether the account is public.

        Side effects: stores ``profile_soup``, ``userid`` and
        ``account_status`` on the instance. Each call sends one HTTP request
        to ``self.scraper.init_url``.
        """
        init_response = self.pixwox_request.send_requests(url=self.scraper.init_url)
        self.profile_soup = self.parser.get_soup(response=init_response)
        self.userid = self.parser.get_userid(profile_soup=self.profile_soup)
        self.account_status = get_account_status(userid=self.userid, profile_soup=self.profile_soup)
        return self.account_status == "public"

    def get_profile(self):
        """Parse the profile figures out of ``self.profile_soup``.

        Stores ``followings``, ``followers``, ``counts_of_posts``,
        ``introduction`` and the combined ``profile_info`` dict on the
        instance. ``check_account_is_public`` must have run first because it
        is what sets ``profile_soup``.
        """
        self.followings = self.parser.get_followings(self.profile_soup)
        self.followers = self.parser.get_followers(self.profile_soup)
        self.counts_of_posts = self.parser.get_counts_of_posts(self.profile_soup)
        try:
            self.introduction = self.parser.get_introduction(self.profile_soup)
        except Exception:
            # The introduction is best-effort: a page without that section
            # yields None. Unlike a bare ``except``, this no longer swallows
            # KeyboardInterrupt / SystemExit.
            self.introduction = None

        self.profile_info = {
            "introduction": self.introduction,
            "counts_of_posts": self.counts_of_posts,
            "followers": self.followers,
            "followings": self.followings,
        }

    def get_init_api_data(self):
        """Return the first page of the posts API for ``self.userid``."""
        return self.scraper.get_init_api_data(userid=self.userid)

    def get_next_api_data(self, next_maxid: str, next_: str, username: str):
        """Return the follow-up page of the posts API identified by the given cursors."""
        return self.scraper.get_next_api_data(
            userid=self.userid, next_maxid=next_maxid, next_=next_, username=username)

    def get_private_account_res(self):
        """Build the result dict for a private account (profile figures, no posts).

        Reads attributes set by ``check_account_is_public`` and ``get_profile``.
        """
        res = {
            "profile": {
                "userid": self.userid,
                "username": self.target_info["username"],
                "followers": self.followers,
                "followings": self.followings,
                "counts_of_posts": self.counts_of_posts,
                "introduction": self.introduction,
            },
            "account_status": self.account_status,
            "updated_at": get_current_time(timezone="Asia/Taipei"),
            "data": [],
        }
        return res

    def get_missing_account_res(self):
        """Build the result dict for an account whose userid could not be found."""
        res = {
            "profile": {
                "userid": None,
                "username": self.target_info["username"],
                "followers": None,
                "followings": None,
                "counts_of_posts": None,
                "introduction": None,
            },
            "account_status": self.account_status,
            "updated_at": get_current_time(timezone="Asia/Taipei"),
            "data": [],
        }
        return res

    def get_public_account_res(self, scraped_posts, init_api_data):
        """Build the result dict for a public account.

        ``init_api_data`` is accepted for interface compatibility; it is not
        read here.
        """
        res = {
            "profile": self.profile_info,
            "account_status": self.account_status,
            "updated_at": get_current_time(timezone="Asia/Taipei"),
            "data": scraped_posts,
        }
        return res

    # @timeout(300)
    def get_period_data(self, days_limit: int, init_maxid: str, init_api_data, username):
        """Page through the posts API and return the accumulated post items.

        Starts from the items of ``init_api_data`` and requests at most
        ``days_limit`` further pages. Paging stops early when every post has
        been collected, when a page contains a post older than ``days_limit``
        days, when the API reports no further page, or when a page request
        fails (the scraper returns ``None`` for a non-200 response).
        """
        # Copy so that the caller's init_api_data is not extended in place.
        scraped_posts_res = list(init_api_data["posts"]["items"])
        next_ = init_api_data["posts"]["next"]
        next_maxid = init_maxid
        for _ in range(days_limit):
            next_api_data = self.get_next_api_data(
                next_maxid=next_maxid, next_=next_, username=username)
            if next_api_data is None:
                # Failed page: keep what has been collected instead of
                # crashing on ``None["posts"]``.
                break
            scraped_posts = next_api_data["posts"]
            scraped_items = scraped_posts["items"]
            scraped_posts_res += scraped_items

            # Stop once all posts, or the posts of the target period, are in.
            if has_all_data_been_collected(
                    scraped_items=scraped_posts_res,
                    counts_of_posts=self.counts_of_posts,
            ) or is_date_exceed_half_year(
                    scraped_items=scraped_items, days_limit=days_limit):
                break
            if not scraped_posts["has_next"]:
                break
            # More pages are available: advance both cursors.
            next_maxid = scraped_posts["maxid"]
            next_ = scraped_posts["next"]
        return scraped_posts_res

    def get_posts(self, target_info: dict):
        """Scrape the account described by ``target_info`` and return a result dict.

        ``target_info`` must contain ``"username"`` and ``"days_limit"``.
        The returned dict has the keys ``profile``, ``account_status``,
        ``updated_at`` and ``data``; ``data`` is empty for private and missing
        accounts.
        """
        self.target_info = target_info
        username = target_info["username"]
        self.username = username
        self.scraper.set_username(username)
        days_limit = target_info["days_limit"]

        # Fetch and classify the profile page once; the status is reused for
        # every branch below instead of requesting the page a second time.
        if not self.check_account_is_public():
            # Report the actual status ("private" or "missing").
            print(f"This is {self.account_status} account")
            if self.account_status == "private":
                self.get_profile()
                return self.get_private_account_res()
            return self.get_missing_account_res()

        # Account information and the first page of posts.
        init_api_data = self.get_init_api_data()
        self.get_profile()
        print("This is public account")

        first_page = init_api_data["posts"]
        if first_page["has_next"] == False:  # noqa: E712 - only a literal false ends paging
            # No further page: the account has no more posts than the first
            # page holds, so that page is the whole result.
            scraped_posts = first_page["items"]
        else:
            scraped_posts = self.get_period_data(
                init_maxid=first_page["maxid"],
                days_limit=days_limit,
                init_api_data=init_api_data,
                username=username,
            )
        return self.get_public_account_res(
            scraped_posts=scraped_posts, init_api_data=init_api_data)
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from bs4 import BeautifulSoup
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Parser(object):
    """Extract profile fields from a profile page parsed with BeautifulSoup."""

    def __init__(self):
        self.__DEFAULT_SOUP_PARSER = "lxml"
        # (tag name, attribute filter) locating the hidden input that
        # carries the account's userid.
        self.__DEFAULT_USER_ELEMENT = (
            "input", {"name": "userid", "type": "hidden"})

    @staticmethod
    def extract_info_value(item_in):
        """Return the ``title`` of the ``div.num`` inside *item_in* with commas removed."""
        return item_in.find('div', class_='num')[
            "title"].replace(',', '')

    @staticmethod
    def get_followers(profile_soup):
        """Return the follower count text found under ``div.item_followers``."""
        item_followers = profile_soup.find("div", class_="item_followers")
        return Parser.extract_info_value(item_in=item_followers)

    @staticmethod
    def get_followings(profile_soup):
        """Return the following count text found under ``div.item_following``."""
        item_followings = profile_soup.find("div", class_="item_following")
        return Parser.extract_info_value(item_in=item_followings)

    @staticmethod
    def get_counts_of_posts(profile_soup):
        """Return the post count text found under ``div.item_posts``."""
        item_posts = profile_soup.find("div", class_="item_posts")
        return Parser.extract_info_value(item_in=item_posts)

    @staticmethod
    def get_introduction(profile_soup):
        """Return the texts of the children of ``div.info > div.sum``.

        Children whose text is exactly a single space are skipped. Raises
        ``AttributeError`` / ``TypeError`` when either container is absent,
        because ``find`` then returns ``None``.
        """
        user_info = profile_soup.find("div", class_="info")
        user_info = user_info.find("div", class_="sum")
        return [str(each_info.text) for each_info in user_info
                if each_info.text != ' ']

    def _find_userid_element(self, soup):
        """Return the hidden userid input of *soup*, or ``None`` when absent."""
        tag_name, attrs = self.__DEFAULT_USER_ELEMENT
        return soup.find(tag_name, attrs)

    def get_soup(self, response):
        """Parse ``response.text`` into a BeautifulSoup tree using the lxml parser."""
        soup = BeautifulSoup(response.text, self.__DEFAULT_SOUP_PARSER)
        return soup

    def get_userid(self, profile_soup):
        """Return the userid stored in the page's hidden input, or ``""`` when absent."""
        userid_input_element = self._find_userid_element(profile_soup)
        if userid_input_element:
            return userid_input_element["value"]
        return ""

    def get_maxid(self, response):
        """Return ``posts.maxid`` from the JSON document in ``response.text``."""
        maxid = json.loads(response.text)["posts"]["maxid"]
        return maxid

    def get_userid_and_soup(self, response):
        """Parse *response* and return ``(userid, soup)``.

        Returns ``("", "")`` when the page has no hidden userid input.
        """
        soup = self.get_soup(response)
        userid_input_element = self._find_userid_element(soup)

        if userid_input_element:
            return userid_input_element["value"], soup

        return "", ""
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class ApiParser(object):
    """Accessors for fields of a decoded posts-API payload."""

    def __init__(self) -> None:
        """Nothing to set up; the parser keeps no state."""

    def get_maxid(self, scraped_api_data:dict) -> str:
        """Return the ``maxid`` entry found under ``posts`` in *scraped_api_data*."""
        posts_section = scraped_api_data["posts"]
        return posts_section["maxid"]
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import json
|
|
3
|
+
import cloudscraper
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class PixwoxRequest(object):
    """HTTP access through a cloudscraper session, plus small response decoders."""

    def __init__(self):
        self.__DEFAULT_SOUP_PARSER = "lxml"
        self.__scraper = cloudscraper.create_scraper(
            delay=10,
            browser={"custom": "ScraperBot/1.0",
                     "platform": "windows",
                     # A real boolean: the string "False" is truthy, i.e. the
                     # opposite of what was meant.
                     "mobile": False})

    def send_requests(self, url, timeout=None):
        """GET *url* through the session and return the response object.

        ``timeout`` is forwarded to the session's ``get``; the default
        ``None`` keeps the previous behaviour of waiting without limit.
        """
        response = self.__scraper.get(url, timeout=timeout)
        return response

    def get_init_content(self, username: str) -> tuple:
        """Fetch the profile page of *username* and return ``(userid, soup)``.

        Returns ``("", "")`` when the page has no hidden ``userid`` input.
        """
        get_url = f"https://www.picnob.com/zh-hant/profile/{username}"
        soup = self.get_init_soup(self.send_requests(get_url))
        userid_input_element = soup.find(
            "input", {"name": "userid", "type": "hidden"})

        if userid_input_element:
            return userid_input_element["value"], soup

        return "", ""

    def get_init_soup(self, profile_response):
        """Parse ``profile_response.text`` into a BeautifulSoup tree (lxml parser)."""
        soup = BeautifulSoup(profile_response.text, self.__DEFAULT_SOUP_PARSER)
        return soup

    def get_maxid(self, response):
        """Return ``posts.maxid`` from the JSON document in ``response.text``."""
        maxid = json.loads(response.text)["posts"]["maxid"]
        return maxid

    def get_data(self, response):
        """Return the decoded JSON document in ``response.text``."""
        scraped_data = json.loads(response.text)
        return scraped_data
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import json
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class Scraper(object):
    """Build the profile/API URLs and fetch the JSON they serve via the injected request helper."""

    def __init__(self, pixwox_request=None, parser=None,api_parser=None):
        """Keep references to the request helper and the two parsers."""
        self.pixwox_request, self.parser, self.api_parser = (
            pixwox_request, parser, api_parser)

    def set_username(self, username:str):
        """Remember the account name used to build ``init_url``."""
        self.__username = username

    @property
    def username(self):
        """The name stored by ``set_username``."""
        return self.__username

    @property
    def init_url(self) -> str:
        """Profile page URL for the current username (also cached on ``_init_url``)."""
        profile_url = f"https://www.picnob.com/zh-hant/profile/{self.username}"
        self._init_url = profile_url
        return profile_url

    def get_init_api(self, userid:str) -> str:
        """URL of the first posts page for *userid* (also cached on ``_init_api``)."""
        first_page_url = f"https://www.piokok.com/api/posts?userid={userid}"
        self._init_api = first_page_url
        return first_page_url

    def get_next_api(self, userid:str, next_maxid:str, next_:str, username:str) -> str:
        """URL of a follow-up posts page identified by the ``next`` and ``maxid`` cursors."""
        return f"https://www.piokok.com/api/posts?username={username}&userid={userid}&next={next_}&maxid={next_maxid}"

    def send_api(self,api_url:str):
        """Request *api_url* through the request helper and hand back its response."""
        return self.pixwox_request.send_requests(url=api_url)

    def get_api_data(self, api_response):
        """Decode the JSON body of *api_response*; print a notice and return ``None`` unless the status is 200."""
        if api_response.status_code == 200:
            return json.loads(api_response.content)
        print("status code from init api request is not equal 200")
        return None

    def get_init_api_data(self,userid:str) -> dict:
        """Fetch and decode the first posts page for *userid*."""
        response = self.send_api(api_url=self.get_init_api(userid=userid))
        return self.get_api_data(api_response=response)

    def get_next_api_data(self, userid:str, next_maxid:str,next_:str, username:str):
        """Fetch and decode the follow-up posts page selected by the given cursors."""
        page_url = self.get_next_api(userid=userid, next_maxid=next_maxid,next_=next_, username=username)
        return self.get_api_data(self.send_api(api_url=page_url))
|
|
File without changes
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import concurrent.futures as futures
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
import pytz
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from functools import wraps
|
|
7
|
+
import time
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def timeit(func):
    """Decorate *func* so each call prints how long it took and then returns the call's result."""
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        began = time.perf_counter()
        outcome = func(*args, **kwargs)
        total_time = time.perf_counter() - began
        print(f'Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds')
        return outcome
    return timeit_wrapper
|
|
20
|
+
|
|
21
|
+
def timeout(timelimit):
    """Build a decorator that raises ``TimeoutError`` when a call exceeds *timelimit* seconds.

    The wrapped function runs in a single worker thread. On timeout the
    caller gets ``TimeoutError`` immediately; the worker thread itself cannot
    be interrupted and keeps running in the background until the function
    returns. Exceptions raised by the function propagate unchanged.
    """
    def decorator(func):
        @wraps(func)
        def decorated(*args, **kwargs):
            # No ``with`` block here: leaving it would wait for the worker to
            # finish, which defeats the timeout.
            executor = futures.ThreadPoolExecutor(max_workers=1)
            try:
                return executor.submit(func, *args, **kwargs).result(timelimit)
            except futures.TimeoutError:
                print('Time out!')
                raise TimeoutError from None
            finally:
                # Release the executor without blocking and without touching
                # the process-wide private registries of concurrent.futures.
                executor.shutdown(wait=False)
        return decorated
    return decorator
|
|
38
|
+
|
|
39
|
+
def get_current_time(timezone="Asia/Taipei"):
    """Return the current time as a timezone-aware ``datetime`` in *timezone*.

    *timezone* is a zone name accepted by ``pytz.timezone``.
    """
    target_timezone = pytz.timezone(timezone)
    # Start from an aware UTC "now" rather than the deprecated naive
    # datetime.utcnow(); the conversion step is unchanged.
    return datetime.now(pytz.utc).astimezone(target_timezone)
|
|
45
|
+
|
|
46
|
+
def get_account_status(userid, profile_soup=None):
    """Classify an account as ``"missing"``, ``"private"`` or ``"public"``.

    An empty *userid* means missing. Otherwise the account is private when
    *profile_soup* contains the lock-icon ``span``, and public when it does not.
    """
    if userid == "":
        return "missing"
    lock_badge = profile_soup.find(
        "span", class_="ident private icon icon_lock")
    return "private" if lock_badge else "public"
|
|
55
|
+
|
|
56
|
+
def has_all_data_been_collected(scraped_items, counts_of_posts):
    """Return True when the distinct shortcodes collected reach the account's post count.

    scraped_items: iterable of post mappings, each with a ``"shortcode"`` key
        (the items are indexed by key, so this is not a DataFrame).
    counts_of_posts: total number of posts; any value ``int()`` accepts.
    """
    unique_shortcodes = {each["shortcode"] for each in scraped_items}
    return len(unique_shortcodes) >= int(counts_of_posts)
|
|
61
|
+
|
|
62
|
+
def is_date_exceed_half_year(scraped_items, days_limit: int):
    """Return True when any scraped post is more than *days_limit* whole days old.

    Despite the name, the threshold is *days_limit*, not a fixed half year.

    scraped_items: iterable of post mappings whose ``"time"`` value is a Unix
        timestamp in seconds.
    An empty *scraped_items* returns False.
    """
    # Compare epoch seconds with epoch seconds: a naive local "now" against
    # UTC-based post times is off by the machine's UTC offset.
    now_ts = time.time()
    days_ago_list = [int((now_ts - float(each["time"])) // 86400)
                     for each in scraped_items]
    if not days_ago_list:
        # Nothing scraped on this page, so nothing can exceed the limit.
        return False
    # Age in days of the oldest post on this page.
    return max(days_ago_list) > days_limit
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import concurrent.futures as futures
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
import pytz
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def timeout(timelimit):
    """Build a decorator that raises ``TimeoutError`` when a call exceeds *timelimit* seconds.

    The wrapped function runs in a single worker thread. On timeout the
    caller gets ``TimeoutError`` immediately; the worker thread itself cannot
    be interrupted and keeps running in the background until the function
    returns. Exceptions raised by the function propagate unchanged.
    """
    # Imported locally because this module does not import functools.
    from functools import wraps

    def decorator(func):
        @wraps(func)
        def decorated(*args, **kwargs):
            # No ``with`` block here: leaving it would wait for the worker to
            # finish, which defeats the timeout.
            executor = futures.ThreadPoolExecutor(max_workers=1)
            try:
                return executor.submit(func, *args, **kwargs).result(timelimit)
            except futures.TimeoutError:
                print('Time out!')
                raise TimeoutError from None
            finally:
                # Release the executor without blocking and without touching
                # the process-wide private registries of concurrent.futures.
                executor.shutdown(wait=False)
        return decorated
    return decorator
|
|
24
|
+
|
|
25
|
+
def get_current_time(timezone="Asia/Taipei"):
    """Return the current time as a timezone-aware ``datetime`` in *timezone*.

    *timezone* is a zone name accepted by ``pytz.timezone``.
    """
    target_timezone = pytz.timezone(timezone)
    # Start from an aware UTC "now" rather than the deprecated naive
    # datetime.utcnow(); the conversion step is unchanged.
    return datetime.now(pytz.utc).astimezone(target_timezone)
|
|
31
|
+
|
|
32
|
+
def get_account_status(userid, profile_soup=None):
    """Return ``"missing"``, ``"private"`` or ``"public"`` for an account.

    ``"missing"`` is reported for an empty *userid*. For any other userid the
    result depends on whether *profile_soup* holds the lock-icon ``span``.
    """
    if userid == "":
        return "missing"
    has_lock_icon = bool(profile_soup.find(
        "span", class_="ident private icon icon_lock"))
    if has_lock_icon:
        return "private"
    return "public"
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) <2025> <FaustRen>
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
5
|
+
in the Software without restriction, including without limitation the rights
|
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
8
|
+
furnished to do so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
|
11
|
+
copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
19
|
+
SOFTWARE.
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: instagram-posts-scraper
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Implement Instagram Posts Scraper for post data retrieval
|
|
5
|
+
Home-page: https://github.com/FaustRen/instagram_posts_scraper
|
|
6
|
+
Author: FaustRen
|
|
7
|
+
Author-email: faustren1z@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.11
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Dynamic: author
|
|
16
|
+
Dynamic: author-email
|
|
17
|
+
Dynamic: classifier
|
|
18
|
+
Dynamic: description
|
|
19
|
+
Dynamic: description-content-type
|
|
20
|
+
Dynamic: home-page
|
|
21
|
+
Dynamic: license
|
|
22
|
+
Dynamic: requires-python
|
|
23
|
+
Dynamic: summary
|
|
24
|
+
|
|
25
|
+
# Instagram Posts Scraper
|
|
26
|
+
|
|
27
|
+
InstagramPostsScraper is a Python library for collecting Instagram users' data.
|
|
28
|
+
|
|
29
|
+
The data obtained by web crawlers is not real-time data, but rather data from a specific point in time on the same day.
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
Use the package manager [pip](https://pip.pypa.io/en/stable/) to install instagram_posts_scraper (`pip install instagram-posts-scraper`). The library depends on the following packages:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
beautifulsoup4==4.13.3
|
|
37
|
+
cloudscraper==1.2.71
|
|
38
|
+
pandas==2.2.3
|
|
39
|
+
pytz==2025.1
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
## Usage - Sample
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from instagram_posts_scraper.instagram_posts_scraper import InstaPeriodScraper
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
target_info = {"username": "joemanweng", "days_limit":5}
|
|
50
|
+
ig_posts_scraper = InstaPeriodScraper()
|
|
51
|
+
res = ig_posts_scraper.get_posts(target_info=target_info)
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Result Sample
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
{
|
|
59
|
+
'profile': {
|
|
60
|
+
'introduction': ['台灣Youtuber\n⬇️專屬團購連結⬇️'],
|
|
61
|
+
'counts_of_posts': '1392',
|
|
62
|
+
'followers': '581198',
|
|
63
|
+
'followings': '859'
|
|
64
|
+
},
|
|
65
|
+
'account_status': 'public',
|
|
66
|
+
'updated_at': datetime.datetime(2025, 2, 9, 1, 28, 8, 793770, tzinfo=<DstTzInfo 'Asia/Taipei' CST+8:00:00 STD>),
|
|
67
|
+
'data': [
|
|
68
|
+
{
|
|
69
|
+
'type': 'igtv',
|
|
70
|
+
'sum': '《靈能的挑戰》參賽者後台花絮露出👀\n老師們比賽中較勁,但私下娛樂竟是互相算命!\n不知道有沒有先算出冠軍了(吃瓜)\n預祝大家新年快樂~~\n*節目中個人言論不代表本節目立場*\n﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏\n#靈能的挑戰\xa0#Joeman\xa0#東森超視33頻道\n全台第一檔靈能競賽節目\n📺️ 全季觀看平台|\n🎥 Joeman YT 頻道觀看全季\n🎥 東森超視 33 頻道已播出完畢\n﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋\n#玄學 #易經 #塔羅 #通靈 #占星',
|
|
71
|
+
'sum_pure': '《靈能的挑戰》參賽者後台花絮露出 老師們比賽中較勁,但私下娛樂竟是互相算命!不知道有沒有先算出冠軍了(吃瓜)預祝大家新年快樂~~*節目中個人言論不代表本節目立場*﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏#靈能的挑戰\xa0#Joeman\xa0#東森超視33頻道全台第一檔靈能競賽節目 ️ 全季觀看平台| Joeman YT 頻道觀看全季 東森超視 33 頻道已播出完畢﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋#玄學 #易經 #塔羅 #通靈 #占星',
|
|
72
|
+
'shortcode': '6741582137415658464423',
|
|
73
|
+
'time': 1737885600,
|
|
74
|
+
'ftime': '13 days ago',
|
|
75
|
+
'count_like': 2442,
|
|
76
|
+
'count_comment': 6,
|
|
77
|
+
'count_like_pure': '2,442',
|
|
78
|
+
'count_comment_pure': '6',
|
|
79
|
+
'thum': 'https://scontent-fra3-1.cdninstagram.com/v/t51.2885-15/474907887_1129543381888335_8766424988966118915_n.jpg?stp=c0.248.640.640a_dst-jpg_e15_tt6&_nc_ht=scontent-fra3-1.cdninstagram.com&_nc_cat=103&_nc_ohc=YPiycXP_teMQ7kNvgF-DLaF&_nc_gid=3e3699c22b4246b98fb450d120d9527f&edm=APU89FABAAAA&ccb=7-5&oh=00_AYAb75YGceozAPWzbq4CQQGd_q_XyfKdjdwcKYak3J-pHA&oe=67AD07FF&_nc_sid=bc0c2c',
|
|
80
|
+
'pic': 'https://scontent-fra3-1.cdninstagram.com/v/t51.2885-15/474907887_1129543381888335_8766424988966118915_n.jpg?stp=dst-jpg_e15_tt6&_nc_ht=scontent-fra3-1.cdninstagram.com&_nc_cat=103&_nc_ohc=YPiycXP_teMQ7kNvgF-DLaF&_nc_gid=3e3699c22b4246b98fb450d120d9527f&edm=APU89FABAAAA&ccb=7-5&oh=00_AYA0qczIKFhJEbukWZRRSUvJJ28iqBuhIXZqRB3EEA450Q&oe=67AD07FF&_nc_sid=bc0c2c',
|
|
81
|
+
'pic_p': 'https://sp1.piokok.com/p/pt_6741582137415658464423_0_233bd378e8b9a4f088f6c850099951c7.jpg?u=https%3A%2F%2Fscontent-fra3-1.cdninstagram.com%2Fv%2Ft51.2885-15%2F474907887_1129543381888335_8766424988966118915_n.jpg%3Fstp%3Dc0.248.640.640a_dst-jpg_e15_tt6%26_nc_ht%3Dscontent-fra3-1.cdninstagram.com%26_nc_cat%3D103%26_nc_ohc%3DYPiycXP_teMQ7kNvgF-DLaF%26_nc_gid%3D3e3699c22b4246b98fb450d120d9527f%26edm%3DAPU89FABAAAA%26ccb%3D7-5%26oh%3D00_AYAb75YGceozAPWzbq4CQQGd_q_XyfKdjdwcKYak3J-pHA%26oe%3D67AD07FF%26_nc_sid%3Dbc0c2c',
|
|
82
|
+
'down_pic': 'https://scontent-fra3-1.cdninstagram.com/v/t51.2885-15/474907887_1129543381888335_8766424988966118915_n.jpg?stp=dst-jpg_e15_tt6&_nc_ht=scontent-fra3-1.cdninstagram.com&_nc_cat=103&_nc_ohc=YPiycXP_teMQ7kNvgF-DLaF&_nc_gid=3e3699c22b4246b98fb450d120d9527f&edm=APU89FABAAAA&ccb=7-5&oh=00_AYA0qczIKFhJEbukWZRRSUvJJ28iqBuhIXZqRB3EEA450Q&oe=67AD07FF&_nc_sid=bc0c2c&dl=1',
|
|
83
|
+
'is_video': True,
|
|
84
|
+
'video': 'https://scontent-fra3-1.cdninstagram.com/o1/v/t16/f2/m86/AQNbkAdWefxU8OJ015A7RX0oUxLJS-03KKFEbd2ueAbuWXS5jla4AsgRPhGeuuD9HsvVUz0mG5uKwOTWpYhvCVbbIPtxarY3vAIwG7A.mp4?stp=dst-mp4&efg=eyJxZV9ncm91cHMiOiJbXCJpZ193ZWJfZGVsaXZlcnlfdnRzX290ZlwiXSIsInZlbmNvZGVfdGFnIjoidnRzX3ZvZF91cmxnZW4uY2xpcHMuYzIuNzIwLmJhc2VsaW5lIn0&_nc_cat=103&vs=427033163733715_1223826287&_nc_vs=HBksFQIYUmlnX3hwdl9yZWVsc19wZXJtYW5lbnRfc3JfcHJvZC9ERDQ1NERCQUU1RkU4RDA5MzA5NzA2Q0IzODgxRUVBM192aWRlb19kYXNoaW5pdC5tcDQVAALIAQAVAhg6cGFzc3Rocm91Z2hfZXZlcnN0b3JlL0dPcGZUeHgyY1EtV3l3c0VBQjRRdldDeTRZdHFicV9FQUFBRhUCAsgBACgAGAAbABUAACamtPfZ0u%2BTQBUCKAJDMywXQF2QEGJN0vIYEmRhc2hfYmFzZWxpbmVfMV92MREAdf4HAA%3D%3D&_nc_rid=3e36937551&ccb=9-4&oh=00_AYANPnQ7Zlk5Z_6wS0OF1Hn3ttS8vsk-0K6OyM9y8xzM6A&oe=67A925DC&_nc_sid=bc0c2c',
|
|
85
|
+
'down_video': 'same_as_above'
|
|
86
|
+
}
|
|
87
|
+
]
|
|
88
|
+
}
```
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
## Contributing - Sample
|
|
94
|
+
|
|
95
|
+
Coming soon.
|
|
96
|
+
|
|
97
|
+
## License - Sample
|
|
98
|
+
|
|
99
|
+
Coming soon.
|
|
100
|
+
|
|
101
|
+
## Tests - cd to tests folder
|
|
102
|
+
coverage run test_crawler.py
|
|
103
|
+
|
|
104
|
+
coverage html
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
instagram_posts_scraper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
instagram_posts_scraper/example.py,sha256=xk5J0BfP_9iC7Q1Ys5kSXEPZYhCe3TCoKMV-DmDt9CM,273
|
|
3
|
+
instagram_posts_scraper/file_operation.py,sha256=O7VnbEdjigT81-wO7_rwkeKBt1SornHKblMYeOQs9zw,803
|
|
4
|
+
instagram_posts_scraper/instagram_posts_scraper.py,sha256=aFE_5bVyErQy9G_bD5j7dDnRcXaLapFn-TDnr1K_H2w,6536
|
|
5
|
+
instagram_posts_scraper/parse.py,sha256=JnNHiOtuCFsgSFBHesEY0kz7eW4yQeZsWRLvEdmUalA,2464
|
|
6
|
+
instagram_posts_scraper/request.py,sha256=igMGrpnT9ua5aP3_L73HmEJuPrxVhX_hPGWHgne5T5k,1326
|
|
7
|
+
instagram_posts_scraper/scraper.py,sha256=GREoMxLzGvCEk5lffb1FshMGjlFsY5pQAVe6-CrhSZ4,1984
|
|
8
|
+
instagram_posts_scraper/utils.py,sha256=8Z882DyPS0c2ElqkdiWr02168ZYvGU_wIYZSWUAb9Ac,1323
|
|
9
|
+
instagram_posts_scraper/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
instagram_posts_scraper/utils/utils.py,sha256=IKcoXg8K2uXEHL59nRooRd0zO8TRU3by81SVOaP7u0k,2504
|
|
11
|
+
instagram_posts_scraper-0.0.1.dist-info/LICENSE,sha256=CsjVOvSrAVRgkvWnFEjgzi931MNvPnOIw5THMXJIaCY,1055
|
|
12
|
+
instagram_posts_scraper-0.0.1.dist-info/METADATA,sha256=HHzGTbndTXWSCA6iMCbkcgMqg4zVtZQW0LU4qJUIA9s,5933
|
|
13
|
+
instagram_posts_scraper-0.0.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
14
|
+
instagram_posts_scraper-0.0.1.dist-info/top_level.txt,sha256=_llg6kHUB_KfCFM9S6Mn5QRs2AJyKCJbsJNb105hcNg,24
|
|
15
|
+
instagram_posts_scraper-0.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
instagram_posts_scraper
|