instagram-posts-scraper 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ Copyright (c) <2025> <FaustRen>
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
@@ -0,0 +1,104 @@
1
+ Metadata-Version: 2.2
2
+ Name: instagram-posts-scraper
3
+ Version: 0.0.1
4
+ Summary: Implement Instagram Posts Scraper for post data retrieval
5
+ Home-page: https://github.com/FaustRen/instagram_posts_scraper
6
+ Author: FaustRen
7
+ Author-email: faustren1z@gmail.com
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.11
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Dynamic: author
16
+ Dynamic: author-email
17
+ Dynamic: classifier
18
+ Dynamic: description
19
+ Dynamic: description-content-type
20
+ Dynamic: home-page
21
+ Dynamic: license
22
+ Dynamic: requires-python
23
+ Dynamic: summary
24
+
25
+ # Instagram Posts Scraper
26
+
27
+ InstagramPostsScraper is a Python library for collecting Instagram users' data.
28
+
29
+ The data obtained by web crawlers is not real-time data, but rather data from a specific point in time on the same day.
30
+
31
+ ## Installation
32
+
33
+ Use the package manager [pip](https://pip.pypa.io/en/stable/) to install `instagram-posts-scraper` (`pip install instagram-posts-scraper`) together with the following dependencies:
34
+
35
+ ```bash
36
+ beautifulsoup4==4.13.3
37
+ cloudscraper==1.2.71
38
+ pandas==2.2.3
39
+ pytz==2025.1
40
+ ```
41
+
42
+
43
+ ## Usage - Sample
44
+
45
+ ```python
46
+ from instagram_posts_scraper.instagram_posts_scraper import InstaPeriodScraper
47
+
48
+
49
+ target_info = {"username": "joemanweng", "days_limit":5}
50
+ ig_posts_scraper = InstaPeriodScraper()
51
+ res = ig_posts_scraper.get_posts(target_info=target_info)
52
+
53
+ ```
54
+
55
+ ## Result Sample
56
+
57
+ ```
58
+ {
59
+ 'profile': {
60
+ 'introduction': ['台灣Youtuber\n⬇️專屬團購連結⬇️'],
61
+ 'counts_of_posts': '1392',
62
+ 'followers': '581198',
63
+ 'followings': '859'
64
+ },
65
+ 'account_status': 'public',
66
+ 'updated_at': datetime.datetime(2025, 2, 9, 1, 28, 8, 793770, tzinfo=<DstTzInfo 'Asia/Taipei' CST+8:00:00 STD>),
67
+ 'data': [
68
+ {
69
+ 'type': 'igtv',
70
+ 'sum': '《靈能的挑戰》參賽者後台花絮露出👀\n老師們比賽中較勁,但私下娛樂竟是互相算命!\n不知道有沒有先算出冠軍了(吃瓜)\n預祝大家新年快樂~~\n*節目中個人言論不代表本節目立場*\n﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏\n#靈能的挑戰\xa0#Joeman\xa0#東森超視33頻道\n全台第一檔靈能競賽節目\n📺️ 全季觀看平台|\n🎥 Joeman YT 頻道觀看全季\n🎥 東森超視 33 頻道已播出完畢\n﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋\n#玄學 #易經 #塔羅 #通靈 #占星',
71
+ 'sum_pure': '《靈能的挑戰》參賽者後台花絮露出 老師們比賽中較勁,但私下娛樂竟是互相算命!不知道有沒有先算出冠軍了(吃瓜)預祝大家新年快樂~~*節目中個人言論不代表本節目立場*﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏#靈能的挑戰\xa0#Joeman\xa0#東森超視33頻道全台第一檔靈能競賽節目 ️ 全季觀看平台| Joeman YT 頻道觀看全季 東森超視 33 頻道已播出完畢﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋#玄學 #易經 #塔羅 #通靈 #占星',
72
+ 'shortcode': '6741582137415658464423',
73
+ 'time': 1737885600,
74
+ 'ftime': '13 days ago',
75
+ 'count_like': 2442,
76
+ 'count_comment': 6,
77
+ 'count_like_pure': '2,442',
78
+ 'count_comment_pure': '6',
79
+ 'thum': 'https://scontent-fra3-1.cdninstagram.com/v/t51.2885-15/474907887_1129543381888335_8766424988966118915_n.jpg?stp=c0.248.640.640a_dst-jpg_e15_tt6&_nc_ht=scontent-fra3-1.cdninstagram.com&_nc_cat=103&_nc_ohc=YPiycXP_teMQ7kNvgF-DLaF&_nc_gid=3e3699c22b4246b98fb450d120d9527f&edm=APU89FABAAAA&ccb=7-5&oh=00_AYAb75YGceozAPWzbq4CQQGd_q_XyfKdjdwcKYak3J-pHA&oe=67AD07FF&_nc_sid=bc0c2c',
80
+ 'pic': 'https://scontent-fra3-1.cdninstagram.com/v/t51.2885-15/474907887_1129543381888335_8766424988966118915_n.jpg?stp=dst-jpg_e15_tt6&_nc_ht=scontent-fra3-1.cdninstagram.com&_nc_cat=103&_nc_ohc=YPiycXP_teMQ7kNvgF-DLaF&_nc_gid=3e3699c22b4246b98fb450d120d9527f&edm=APU89FABAAAA&ccb=7-5&oh=00_AYA0qczIKFhJEbukWZRRSUvJJ28iqBuhIXZqRB3EEA450Q&oe=67AD07FF&_nc_sid=bc0c2c',
81
+ 'pic_p': 'https://sp1.piokok.com/p/pt_6741582137415658464423_0_233bd378e8b9a4f088f6c850099951c7.jpg?u=https%3A%2F%2Fscontent-fra3-1.cdninstagram.com%2Fv%2Ft51.2885-15%2F474907887_1129543381888335_8766424988966118915_n.jpg%3Fstp%3Dc0.248.640.640a_dst-jpg_e15_tt6%26_nc_ht%3Dscontent-fra3-1.cdninstagram.com%26_nc_cat%3D103%26_nc_ohc%3DYPiycXP_teMQ7kNvgF-DLaF%26_nc_gid%3D3e3699c22b4246b98fb450d120d9527f%26edm%3DAPU89FABAAAA%26ccb%3D7-5%26oh%3D00_AYAb75YGceozAPWzbq4CQQGd_q_XyfKdjdwcKYak3J-pHA%26oe%3D67AD07FF%26_nc_sid%3Dbc0c2c',
82
+ 'down_pic': 'https://scontent-fra3-1.cdninstagram.com/v/t51.2885-15/474907887_1129543381888335_8766424988966118915_n.jpg?stp=dst-jpg_e15_tt6&_nc_ht=scontent-fra3-1.cdninstagram.com&_nc_cat=103&_nc_ohc=YPiycXP_teMQ7kNvgF-DLaF&_nc_gid=3e3699c22b4246b98fb450d120d9527f&edm=APU89FABAAAA&ccb=7-5&oh=00_AYA0qczIKFhJEbukWZRRSUvJJ28iqBuhIXZqRB3EEA450Q&oe=67AD07FF&_nc_sid=bc0c2c&dl=1',
83
+ 'is_video': True,
84
+ 'video': 'https://scontent-fra3-1.cdninstagram.com/o1/v/t16/f2/m86/AQNbkAdWefxU8OJ015A7RX0oUxLJS-03KKFEbd2ueAbuWXS5jla4AsgRPhGeuuD9HsvVUz0mG5uKwOTWpYhvCVbbIPtxarY3vAIwG7A.mp4?stp=dst-mp4&efg=eyJxZV9ncm91cHMiOiJbXCJpZ193ZWJfZGVsaXZlcnlfdnRzX290ZlwiXSIsInZlbmNvZGVfdGFnIjoidnRzX3ZvZF91cmxnZW4uY2xpcHMuYzIuNzIwLmJhc2VsaW5lIn0&_nc_cat=103&vs=427033163733715_1223826287&_nc_vs=HBksFQIYUmlnX3hwdl9yZWVsc19wZXJtYW5lbnRfc3JfcHJvZC9ERDQ1NERCQUU1RkU4RDA5MzA5NzA2Q0IzODgxRUVBM192aWRlb19kYXNoaW5pdC5tcDQVAALIAQAVAhg6cGFzc3Rocm91Z2hfZXZlcnN0b3JlL0dPcGZUeHgyY1EtV3l3c0VBQjRRdldDeTRZdHFicV9FQUFBRhUCAsgBACgAGAAbABUAACamtPfZ0u%2BTQBUCKAJDMywXQF2QEGJN0vIYEmRhc2hfYmFzZWxpbmVfMV92MREAdf4HAA%3D%3D&_nc_rid=3e36937551&ccb=9-4&oh=00_AYANPnQ7Zlk5Z_6wS0OF1Hn3ttS8vsk-0K6OyM9y8xzM6A&oe=67A925DC&_nc_sid=bc0c2c',
85
+ 'down_video': 'same_as_above'
86
+ }
87
+ ]
88
+ }
89
+
90
+
91
+
92
+
93
+ ## Contributing - Sample
94
+
95
+ Coming soon.
96
+
97
+ ## License - Sample
98
+
99
+ Coming soon.
100
+
101
+ ## Tests - cd to tests folder
102
+ coverage run test_crawler.py
103
+
104
+ coverage html
@@ -0,0 +1,10 @@
1
+ # -*- coding: utf-8 -*-
2
+ #%%
3
+ from instagram_posts_scraper.instagram_posts_scraper import InstaPeriodScraper
4
+
5
+
6
# Example request: scrape the account "joemanweng" with a days_limit of 5.
target_info = dict(username="joemanweng", days_limit=5)
ig_posts_scraper = InstaPeriodScraper()
res = ig_posts_scraper.get_posts(target_info=target_info)
# Bare expression: lets an interactive cell runner (see the `#%%` markers)
# display the scraped result.
res
10
+ # %%
@@ -0,0 +1,28 @@
1
+ # -*- coding: utf-8 -*-
2
+ import os
3
+ import json
4
+ import pandas as pd
5
+
6
class JsonOperation:
    """Read the JSON files stored in the ``resources`` folder next to this module."""

    @staticmethod
    def _load_resource_json(resource_name):
        """Parse ``resources/<resource_name>`` as JSON and return the result."""
        # os.path.join keeps the path relative when dirname(__file__) is empty;
        # plain string concatenation would then produce "/resources/...",
        # i.e. a path at the filesystem root.
        resource_path = os.path.join(
            os.path.dirname(__file__), "resources", resource_name)
        # Read as UTF-8 explicitly instead of relying on the locale default.
        with open(resource_path, "r", encoding="utf-8") as file:
            return json.load(file)

    @staticmethod
    def read_config_content():
        """Return the parsed content of ``resources/config``.

        Raises:
            FileNotFoundError: if the resource file does not exist.
            json.JSONDecodeError: if the file does not contain valid JSON.
        """
        return JsonOperation._load_resource_json("config")

    @staticmethod
    def read_start_point():
        """Return the parsed content of ``resources/start_point_config``.

        Raises:
            FileNotFoundError: if the resource file does not exist.
            json.JSONDecodeError: if the file does not contain valid JSON.
        """
        return JsonOperation._load_resource_json("start_point_config")
20
+
21
+
22
+
23
class CsvOperation:
    """Read the CSV input stored in the ``resources`` folder next to this module."""

    @staticmethod
    def read_kol_csv():
        """Load ``resources/new_input_scraper.csv`` into a pandas DataFrame.

        Raises:
            FileNotFoundError: if the CSV file does not exist.
        """
        # os.path.join keeps the path relative when dirname(__file__) is empty;
        # string concatenation would point at the filesystem root instead.
        csv_path = os.path.join(
            os.path.dirname(__file__), "resources", "new_input_scraper.csv")
        return pd.read_csv(csv_path)
@@ -0,0 +1,160 @@
1
+ # -*- coding: utf-8 -*-
2
+ from instagram_posts_scraper.request import *
3
+ from instagram_posts_scraper.parse import *
4
+ from instagram_posts_scraper.utils import *
5
+ from instagram_posts_scraper.scraper import *
6
+ from instagram_posts_scraper.utils.utils import *
7
+ from instagram_posts_scraper.file_operation import *
8
+
9
+
10
class ScrapedDataManager(object):
    """Placeholder type: it currently keeps no state and defines no behaviour."""

    def __init__(self):
        # Nothing to initialise yet.
        return None
13
+
14
+
15
class InstaPeriodScraper(object):
    """Scrape the profile and the posts of one Instagram account.

    The public entry point is :meth:`get_posts`. It classifies the account as
    ``"public"``, ``"private"`` or ``"missing"`` and returns a dict with the
    keys ``profile``, ``account_status``, ``updated_at`` and ``data``.
    """

    def __init__(self) -> None:
        self.pixwox_request = PixwoxRequest()
        self.parser = Parser()
        self.api_parser = ApiParser()
        self.scraper = Scraper(
            pixwox_request=self.pixwox_request,
            parser=self.parser,
            api_parser=self.api_parser,
        )

    def check_account_is_public(self):
        """Download the profile page and classify the account.

        Stores ``profile_soup``, ``userid`` and ``account_status`` on the
        instance. Every call sends one HTTP request.

        Returns:
            bool: True when the account status is ``"public"``.
        """
        init_response = self.pixwox_request.send_requests(url=self.scraper.init_url)
        self.profile_soup = self.parser.get_soup(response=init_response)
        self.userid = self.parser.get_userid(profile_soup=self.profile_soup)
        self.account_status = get_account_status(
            userid=self.userid, profile_soup=self.profile_soup)
        return self.account_status == "public"

    def get_profile(self):
        """Parse the profile counters and the introduction from ``profile_soup``.

        Stores ``followings``, ``followers``, ``counts_of_posts``,
        ``introduction`` and the combined ``profile_info`` dict on the instance.
        """
        self.followings = self.parser.get_followings(self.profile_soup)
        self.followers = self.parser.get_followers(self.profile_soup)
        self.counts_of_posts = self.parser.get_counts_of_posts(self.profile_soup)
        try:
            self.introduction = self.parser.get_introduction(self.profile_soup)
        except (AttributeError, TypeError):
            # The introduction is optional: the parser raises one of these when
            # the expected <div> elements are absent from the page.
            self.introduction = None

        self.profile_info = {
            "introduction": self.introduction,
            "counts_of_posts": self.counts_of_posts,
            "followers": self.followers,
            "followings": self.followings,
        }

    def get_init_api_data(self):
        """Return the first page of API data for ``self.userid`` (may be None)."""
        return self.scraper.get_init_api_data(userid=self.userid)

    def get_next_api_data(self, next_maxid: str, next_: str, username: str):
        """Return the API page addressed by the given cursors (may be None)."""
        return self.scraper.get_next_api_data(
            userid=self.userid, next_maxid=next_maxid, next_=next_, username=username)

    def get_private_account_res(self):
        """Build the result dict for a private account (profile only, no posts)."""
        return {
            "profile": {
                "userid": self.userid,
                "username": self.target_info["username"],
                "followers": self.followers,
                "followings": self.followings,
                "counts_of_posts": self.counts_of_posts,
                "introduction": self.introduction,
            },
            "account_status": self.account_status,
            "updated_at": get_current_time(timezone="Asia/Taipei"),
            "data": [],
        }

    def get_missing_account_res(self):
        """Build the result dict for an account whose user id was not found."""
        return {
            "profile": {
                "userid": None,
                "username": self.target_info["username"],
                "followers": None,
                "followings": None,
                "counts_of_posts": None,
                "introduction": None,
            },
            "account_status": self.account_status,
            "updated_at": get_current_time(timezone="Asia/Taipei"),
            "data": [],
        }

    def get_public_account_res(self, scraped_posts, init_api_data):
        """Build the result dict for a public account.

        ``init_api_data`` is accepted for interface compatibility; it is not
        used when building the result.
        """
        return {
            "profile": self.profile_info,
            "account_status": self.account_status,
            "updated_at": get_current_time(timezone="Asia/Taipei"),
            "data": scraped_posts,
        }

    @staticmethod
    def _all_posts_collected(scraped_items, counts_of_posts) -> bool:
        """Return True when the number of distinct shortcodes reaches the post count."""
        distinct_shortcodes = {item["shortcode"] for item in scraped_items}
        return len(distinct_shortcodes) >= int(counts_of_posts)

    @staticmethod
    def _page_exceeds_days_limit(scraped_items, days_limit) -> bool:
        """Return True when the oldest post of the page is more than ``days_limit`` days old."""
        import time

        if not scraped_items:
            # An empty page carries no date information.
            return False
        # Post times are handled as epoch seconds, so the age is computed
        # against the epoch clock and is independent of the local time zone.
        oldest_timestamp = min(float(item["time"]) for item in scraped_items)
        return (time.time() - oldest_timestamp) // 86400 > days_limit

    # @timeout(300)
    def get_period_data(self, days_limit: int, init_maxid: str, init_api_data, username):
        """Page through the posts API and accumulate the returned items.

        The stop checks are private helpers of this class so that paging does
        not depend on module-level names resolved through the star imports.

        Args:
            days_limit: Age limit in days for the stop check; it also bounds
                the number of additional pages that are requested.
            init_maxid: ``maxid`` cursor of the first page.
            init_api_data: First API page; its items seed the result.
            username: Account name forwarded to the API.

        Returns:
            list: Items of the first page followed by the items of every
            additional page that was fetched.
        """
        first_page = init_api_data["posts"]
        # Copy so that extending the result does not mutate init_api_data.
        scraped_posts_res = list(first_page["items"])
        next_ = first_page["next"]
        next_maxid = init_maxid
        for _ in range(days_limit):
            next_api_data = self.get_next_api_data(
                next_maxid=next_maxid, next_=next_, username=username)
            if next_api_data is None:
                # The request failed (non-200): keep what was collected so far.
                break
            scraped_posts = next_api_data["posts"]
            scraped_items = scraped_posts["items"]
            scraped_posts_res += scraped_items

            # Stop once every post was seen or the page reaches past the limit.
            if self._all_posts_collected(scraped_posts_res, self.counts_of_posts) \
                    or self._page_exceeds_days_limit(scraped_items, days_limit):
                break
            if not scraped_posts["has_next"]:
                # No further page is available.
                break
            next_maxid = scraped_posts["maxid"]
            next_ = scraped_posts["next"]
        return scraped_posts_res

    def get_posts(self, target_info: dict):
        """Scrape one account and return its profile and posts.

        Args:
            target_info: Dict with the keys ``"username"`` and ``"days_limit"``.

        Returns:
            dict: Result with the keys ``profile``, ``account_status``,
            ``updated_at`` and ``data``.
        """
        self.target_info = target_info
        username = target_info["username"]
        self.username = username
        self.scraper.set_username(username)
        days_limit = target_info["days_limit"]

        # Classify once: every check downloads the profile page again.
        if not self.check_account_is_public():
            if self.account_status == "private":
                print("This is private account")
                self.get_profile()
                return self.get_private_account_res()
            # "missing" is the only other non-public status.
            print("This is missing account")
            return self.get_missing_account_res()

        init_api_data = self.get_init_api_data()  # account info & top-of-page content
        self.get_profile()
        print("This is public account")

        if init_api_data is None:
            # The first API page could not be fetched (non-200 response).
            return self.get_public_account_res(
                scraped_posts=[], init_api_data=init_api_data)

        first_page = init_api_data["posts"]
        if first_page["has_next"] == False:  # noqa: E712 - keep equality semantics
            # The account has at most one page of posts (<= 12), nothing to page.
            scraped_posts = first_page["items"]
        else:
            scraped_posts = self.get_period_data(
                init_maxid=first_page["maxid"],
                days_limit=days_limit,
                init_api_data=init_api_data,
                username=username,
            )
        return self.get_public_account_res(
            scraped_posts=scraped_posts, init_api_data=init_api_data)
@@ -0,0 +1,75 @@
1
+ # -*- coding: utf-8 -*-
2
+ from bs4 import BeautifulSoup
3
+ import json
4
+
5
+
6
class Parser(object):
    """Helpers that read values out of a profile page and out of API responses."""

    def __init__(self):
        self.__DEFAULT_SOUP_PARSER = "lxml"
        self.__DEFAULT_USER_ELEMENT = (
            "input", {"name": "userid", "type": "hidden"})

    @staticmethod
    def extract_info_value(item_in):
        """Return the ``title`` of the ``div.num`` inside ``item_in`` without commas."""
        counter_div = item_in.find('div', class_='num')
        raw_title = counter_div["title"]
        return raw_title.replace(',', '')

    @staticmethod
    def get_followers(profile_soup):
        """Return the value found in the ``div.item_followers`` element."""
        return Parser.extract_info_value(
            item_in=profile_soup.find("div", class_="item_followers"))

    @staticmethod
    def get_followings(profile_soup):
        """Return the value found in the ``div.item_following`` element."""
        return Parser.extract_info_value(
            item_in=profile_soup.find("div", class_="item_following"))

    @staticmethod
    def get_counts_of_posts(profile_soup):
        """Return the value found in the ``div.item_posts`` element."""
        return Parser.extract_info_value(
            item_in=profile_soup.find("div", class_="item_posts"))

    @staticmethod
    def get_introduction(profile_soup):
        """Return the texts of the children of ``div.info > div.sum``.

        Children whose text is exactly one space are left out.
        """
        info_block = profile_soup.find("div", class_="info")
        summary_block = info_block.find("div", class_="sum")
        return [str(child.text) for child in summary_block if child.text != ' ']

    def get_soup(self, response):
        """Parse ``response.text`` into a BeautifulSoup document."""
        return BeautifulSoup(response.text, self.__DEFAULT_SOUP_PARSER)

    def get_userid(self, profile_soup):
        """Return the value of the hidden ``userid`` input, or ``""`` if absent."""
        hidden_input = profile_soup.find(
            "input", {"name": "userid", "type": "hidden"})
        return hidden_input["value"] if hidden_input else ""

    def get_maxid(self, response):
        """Return ``posts.maxid`` from the JSON text of ``response``."""
        payload = json.loads(response.text)
        return payload["posts"]["maxid"]

    def get_userid_and_soup(self, response):
        """Parse ``response.text`` and return ``(userid, soup)``.

        Returns ``("", "")`` when the hidden ``userid`` input is not found.
        """
        page = BeautifulSoup(response.text, self.__DEFAULT_SOUP_PARSER)
        hidden_input = page.find(
            "input", {"name": "userid", "type": "hidden"})
        if not hidden_input:
            return "", ""
        return hidden_input["value"], page
68
+
69
+
70
class ApiParser(object):
    """Reads values out of an already decoded posts-API payload."""

    def __init__(self) -> None:
        # No state to initialise.
        return None

    def get_maxid(self, scraped_api_data: dict) -> str:
        """Return the ``maxid`` stored under the ``posts`` key of the payload."""
        posts_section = scraped_api_data["posts"]
        return posts_section["maxid"]
@@ -0,0 +1,42 @@
1
+ # -*- coding: utf-8 -*-
2
+ import json
3
+ import cloudscraper
4
+ from bs4 import BeautifulSoup
5
+
6
+
7
class PixwoxRequest(object):
    """HTTP access through a cloudscraper session, plus small response decoders."""

    def __init__(self):
        self.__DEFAULT_SOUP_PARSER = "lxml"
        self.__scraper = cloudscraper.create_scraper(
            delay=10,
            browser={"custom": "ScraperBot/1.0",
                     "platform": "windows",
                     # A real boolean: the string "False" is truthy.
                     "mobile": False})

    def send_requests(self, url):
        """Send a GET request for ``url`` and return the response object."""
        return self.__scraper.get(url)

    def get_init_content(self, username: str) -> tuple:
        """Download the profile page of ``username``.

        Returns:
            tuple: ``(userid, soup)`` when the hidden ``userid`` input is
            found in the page, otherwise ``("", "")``.
        """
        get_url = f"https://www.picnob.com/zh-hant/profile/{username}"
        res = self.send_requests(get_url)
        soup = BeautifulSoup(res.text, self.__DEFAULT_SOUP_PARSER)
        userid_input_element = soup.find(
            "input", {"name": "userid", "type": "hidden"})

        if userid_input_element:
            return userid_input_element["value"], soup

        return "", ""

    def get_init_soup(self, profile_response):
        """Parse ``profile_response.text`` into a BeautifulSoup document."""
        return BeautifulSoup(profile_response.text, self.__DEFAULT_SOUP_PARSER)

    def get_maxid(self, response):
        """Return ``posts.maxid`` from the JSON text of ``response``."""
        return json.loads(response.text)["posts"]["maxid"]

    def get_data(self, response):
        """Decode the JSON text of ``response`` and return the result."""
        return json.loads(response.text)
@@ -0,0 +1,53 @@
1
+ # -*- coding: utf-8 -*-
2
+ import json
3
+
4
+
5
class Scraper(object):
    """Builds the profile/API URLs and fetches them through ``pixwox_request``."""

    def __init__(self, pixwox_request=None, parser=None, api_parser=None):
        self.pixwox_request, self.parser, self.api_parser = (
            pixwox_request, parser, api_parser)
        # self.target_info = target_info

    def set_username(self, username: str):
        """Remember the account name used to build the profile URL."""
        self.__username = username

    @property
    def username(self):
        """Account name stored by :meth:`set_username`."""
        return self.__username

    @property
    def init_url(self) -> str:
        """Profile page URL of the current username (also kept in ``_init_url``)."""
        profile_url = f"https://www.picnob.com/zh-hant/profile/{self.username}"
        self._init_url = profile_url
        return profile_url

    def get_init_api(self, userid: str) -> str:
        """Return the first-page API URL for ``userid`` (also kept in ``_init_api``)."""
        first_page_url = f"https://www.piokok.com/api/posts?userid={userid}"
        self._init_api = first_page_url
        return first_page_url

    def get_next_api(self, userid: str, next_maxid: str, next_: str, username: str) -> str:
        """Return the API URL of the page addressed by the given cursors."""
        return f"https://www.piokok.com/api/posts?username={username}&userid={userid}&next={next_}&maxid={next_maxid}"

    def send_api(self, api_url: str):
        """Request ``api_url`` and return the response object."""
        return self.pixwox_request.send_requests(url=api_url)

    def get_api_data(self, api_response):
        """Decode the JSON body of ``api_response``.

        Returns None, after printing a notice, when the status code is not 200.
        """
        if api_response.status_code == 200:
            return json.loads(api_response.content)
        print("status code from init api request is not equal 200")
        return None

    def get_init_api_data(self, userid: str) -> dict:
        """Fetch and decode the first API page for ``userid``."""
        first_page_response = self.send_api(api_url=self.get_init_api(userid=userid))
        return self.get_api_data(api_response=first_page_response)

    def get_next_api_data(self, userid: str, next_maxid: str, next_: str, username: str):
        """Fetch and decode the API page addressed by the given cursors."""
        page_url = self.get_next_api(
            userid=userid, next_maxid=next_maxid, next_=next_, username=username)
        page_response = self.send_api(api_url=page_url)
        return self.get_api_data(page_response)
@@ -0,0 +1,71 @@
1
+ # -*- coding: utf-8 -*-
2
+ import concurrent.futures as futures
3
+ from datetime import datetime
4
+ import pytz
5
+ import pandas as pd
6
+ from functools import wraps
7
+ import time
8
+
9
+
10
def timeit(func):
    """Decorate ``func`` so that every call prints how long it took.

    The wrapped function's return value is passed through unchanged.
    """
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        started_at = time.perf_counter()
        outcome = func(*args, **kwargs)
        total_time = time.perf_counter() - started_at
        print(f'Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds')
        return outcome

    return timeit_wrapper
20
+
21
def timeout(timelimit):
    """Build a decorator that stops waiting for a call after ``timelimit`` seconds.

    The wrapped function runs in a single worker thread. When it does not
    finish in time, ``TimeoutError`` is raised to the caller right away. The
    worker thread cannot be interrupted, so the call itself keeps running in
    the background until it returns.

    Args:
        timelimit: Maximum number of seconds to wait for the result.

    Raises:
        TimeoutError: raised by the decorated function when the limit is hit.
    """
    def decorator(func):
        @wraps(func)
        def decorated(*args, **kwargs):
            executor = futures.ThreadPoolExecutor(max_workers=1)
            future = executor.submit(func, *args, **kwargs)
            try:
                return future.result(timelimit)
            except futures.TimeoutError:
                print('Time out!')
                raise TimeoutError from None
            finally:
                # wait=False: do not join the worker here. Leaving a `with`
                # block would block until the call finishes, which delays the
                # TimeoutError until the work is done anyway.
                executor.shutdown(wait=False)
        return decorated
    return decorator
38
+
39
def get_current_time(timezone="Asia/Taipei"):
    """Return the current time as a timezone-aware datetime.

    Args:
        timezone: Name of the target time zone, passed to ``pytz.timezone``.

    Returns:
        datetime: The current moment expressed in ``timezone``.

    Raises:
        pytz.UnknownTimeZoneError: if ``timezone`` is not a known zone name.
    """
    # datetime.now(tz) converts from UTC through the tzinfo object, so it
    # replaces the deprecated datetime.utcnow() plus manual conversion.
    return datetime.now(pytz.timezone(timezone))
45
+
46
def get_account_status(userid, profile_soup=None):
    """Classify an account as ``"missing"``, ``"private"`` or ``"public"``.

    An empty ``userid`` means ``"missing"``. Otherwise the account is
    ``"private"`` when ``profile_soup`` contains the lock ``span``, and
    ``"public"`` when it does not.
    """
    if userid == "":
        return "missing"
    lock_badge = profile_soup.find(
        "span", class_="ident private icon icon_lock")
    return "private" if lock_badge else "public"
55
+
56
def has_all_data_been_collected(scraped_items: list, counts_of_posts) -> bool:
    """Tell whether every post of the account has been scraped.

    Args:
        scraped_items: Scraped posts; each item is indexed by ``"shortcode"``.
        counts_of_posts: Total number of posts; anything ``int()`` accepts.

    Returns:
        bool: True when the number of distinct shortcodes is at least
        ``counts_of_posts``.
    """
    # Count distinct shortcodes so repeated posts are not counted twice.
    unique_shortcodes = {each["shortcode"] for each in scraped_items}
    return len(unique_shortcodes) >= int(counts_of_posts)
61
+
62
def is_date_exceed_half_year(scraped_items: list, days_limit: int) -> bool:
    """Tell whether the oldest scraped post is more than ``days_limit`` days old.

    Despite the name, the threshold is ``days_limit`` and not a fixed half year.

    Args:
        scraped_items: Scraped posts; each item holds its publication time in
            epoch seconds under ``"time"``.
        days_limit: Maximum accepted age of a post in whole days.

    Returns:
        bool: True when at least one post is older than ``days_limit`` days;
        False otherwise, including for an empty list.
    """
    if not scraped_items:
        # No posts means nothing can be older than the limit.
        return False
    # Among the scraped posts, take the one published furthest in the past.
    oldest_timestamp = min(float(each["time"]) for each in scraped_items)
    # Both values are epoch seconds, so the age is time-zone independent.
    max_days_ago = int((time.time() - oldest_timestamp) // 86400)
    return max_days_ago > days_limit
@@ -0,0 +1,40 @@
1
+ # -*- coding: utf-8 -*-
2
+ import concurrent.futures as futures
3
+ from datetime import datetime
4
+ import pytz
5
+
6
+
7
def timeout(timelimit):
    """Build a decorator that stops waiting for a call after ``timelimit`` seconds.

    The wrapped function runs in a single worker thread. When it does not
    finish in time, ``TimeoutError`` is raised to the caller right away. The
    worker thread cannot be interrupted, so the call itself keeps running in
    the background until it returns.

    Args:
        timelimit: Maximum number of seconds to wait for the result.

    Raises:
        TimeoutError: raised by the decorated function when the limit is hit.
    """
    # Imported locally because this module does not import functools itself.
    from functools import wraps

    def decorator(func):
        @wraps(func)
        def decorated(*args, **kwargs):
            executor = futures.ThreadPoolExecutor(max_workers=1)
            future = executor.submit(func, *args, **kwargs)
            try:
                return future.result(timelimit)
            except futures.TimeoutError:
                print('Time out!')
                raise TimeoutError from None
            finally:
                # wait=False: do not join the worker here. Leaving a `with`
                # block would block until the call finishes, which delays the
                # TimeoutError until the work is done anyway.
                executor.shutdown(wait=False)
        return decorated
    return decorator
24
+
25
def get_current_time(timezone="Asia/Taipei"):
    """Return the current time as a timezone-aware datetime.

    Args:
        timezone: Name of the target time zone, passed to ``pytz.timezone``.

    Returns:
        datetime: The current moment expressed in ``timezone``.

    Raises:
        pytz.UnknownTimeZoneError: if ``timezone`` is not a known zone name.
    """
    # datetime.now(tz) converts from UTC through the tzinfo object, so it
    # replaces the deprecated datetime.utcnow() plus manual conversion.
    return datetime.now(pytz.timezone(timezone))
31
+
32
def get_account_status(userid, profile_soup=None):
    """Classify an account as ``"missing"``, ``"private"`` or ``"public"``.

    An empty ``userid`` means ``"missing"``. Otherwise the account is
    ``"private"`` when ``profile_soup`` contains the lock ``span``, and
    ``"public"`` when it does not.
    """
    if userid == "":
        return "missing"
    lock_badge = profile_soup.find(
        "span", class_="ident private icon icon_lock")
    return "private" if lock_badge else "public"
@@ -0,0 +1,104 @@
1
+ Metadata-Version: 2.2
2
+ Name: instagram-posts-scraper
3
+ Version: 0.0.1
4
+ Summary: Implement Instagram Posts Scraper for post data retrieval
5
+ Home-page: https://github.com/FaustRen/instagram_posts_scraper
6
+ Author: FaustRen
7
+ Author-email: faustren1z@gmail.com
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.11
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Dynamic: author
16
+ Dynamic: author-email
17
+ Dynamic: classifier
18
+ Dynamic: description
19
+ Dynamic: description-content-type
20
+ Dynamic: home-page
21
+ Dynamic: license
22
+ Dynamic: requires-python
23
+ Dynamic: summary
24
+
25
+ # Instagram Posts Scraper
26
+
27
+ InstagramPostsScraper is a Python library for collecting Instagram users' data.
28
+
29
+ The data obtained by web crawlers is not real-time data, but rather data from a specific point in time on the same day.
30
+
31
+ ## Installation
32
+
33
+ Use the package manager [pip](https://pip.pypa.io/en/stable/) to install `instagram-posts-scraper` (`pip install instagram-posts-scraper`) together with the following dependencies:
34
+
35
+ ```bash
36
+ beautifulsoup4==4.13.3
37
+ cloudscraper==1.2.71
38
+ pandas==2.2.3
39
+ pytz==2025.1
40
+ ```
41
+
42
+
43
+ ## Usage - Sample
44
+
45
+ ```python
46
+ from instagram_posts_scraper.instagram_posts_scraper import InstaPeriodScraper
47
+
48
+
49
+ target_info = {"username": "joemanweng", "days_limit":5}
50
+ ig_posts_scraper = InstaPeriodScraper()
51
+ res = ig_posts_scraper.get_posts(target_info=target_info)
52
+
53
+ ```
54
+
55
+ ## Result Sample
56
+
57
+ ```
58
+ {
59
+ 'profile': {
60
+ 'introduction': ['台灣Youtuber\n⬇️專屬團購連結⬇️'],
61
+ 'counts_of_posts': '1392',
62
+ 'followers': '581198',
63
+ 'followings': '859'
64
+ },
65
+ 'account_status': 'public',
66
+ 'updated_at': datetime.datetime(2025, 2, 9, 1, 28, 8, 793770, tzinfo=<DstTzInfo 'Asia/Taipei' CST+8:00:00 STD>),
67
+ 'data': [
68
+ {
69
+ 'type': 'igtv',
70
+ 'sum': '《靈能的挑戰》參賽者後台花絮露出👀\n老師們比賽中較勁,但私下娛樂竟是互相算命!\n不知道有沒有先算出冠軍了(吃瓜)\n預祝大家新年快樂~~\n*節目中個人言論不代表本節目立場*\n﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏\n#靈能的挑戰\xa0#Joeman\xa0#東森超視33頻道\n全台第一檔靈能競賽節目\n📺️ 全季觀看平台|\n🎥 Joeman YT 頻道觀看全季\n🎥 東森超視 33 頻道已播出完畢\n﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋\n#玄學 #易經 #塔羅 #通靈 #占星',
71
+ 'sum_pure': '《靈能的挑戰》參賽者後台花絮露出 老師們比賽中較勁,但私下娛樂竟是互相算命!不知道有沒有先算出冠軍了(吃瓜)預祝大家新年快樂~~*節目中個人言論不代表本節目立場*﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏﹏#靈能的挑戰\xa0#Joeman\xa0#東森超視33頻道全台第一檔靈能競賽節目 ️ 全季觀看平台| Joeman YT 頻道觀看全季 東森超視 33 頻道已播出完畢﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋﹋#玄學 #易經 #塔羅 #通靈 #占星',
72
+ 'shortcode': '6741582137415658464423',
73
+ 'time': 1737885600,
74
+ 'ftime': '13 days ago',
75
+ 'count_like': 2442,
76
+ 'count_comment': 6,
77
+ 'count_like_pure': '2,442',
78
+ 'count_comment_pure': '6',
79
+ 'thum': 'https://scontent-fra3-1.cdninstagram.com/v/t51.2885-15/474907887_1129543381888335_8766424988966118915_n.jpg?stp=c0.248.640.640a_dst-jpg_e15_tt6&_nc_ht=scontent-fra3-1.cdninstagram.com&_nc_cat=103&_nc_ohc=YPiycXP_teMQ7kNvgF-DLaF&_nc_gid=3e3699c22b4246b98fb450d120d9527f&edm=APU89FABAAAA&ccb=7-5&oh=00_AYAb75YGceozAPWzbq4CQQGd_q_XyfKdjdwcKYak3J-pHA&oe=67AD07FF&_nc_sid=bc0c2c',
80
+ 'pic': 'https://scontent-fra3-1.cdninstagram.com/v/t51.2885-15/474907887_1129543381888335_8766424988966118915_n.jpg?stp=dst-jpg_e15_tt6&_nc_ht=scontent-fra3-1.cdninstagram.com&_nc_cat=103&_nc_ohc=YPiycXP_teMQ7kNvgF-DLaF&_nc_gid=3e3699c22b4246b98fb450d120d9527f&edm=APU89FABAAAA&ccb=7-5&oh=00_AYA0qczIKFhJEbukWZRRSUvJJ28iqBuhIXZqRB3EEA450Q&oe=67AD07FF&_nc_sid=bc0c2c',
81
+ 'pic_p': 'https://sp1.piokok.com/p/pt_6741582137415658464423_0_233bd378e8b9a4f088f6c850099951c7.jpg?u=https%3A%2F%2Fscontent-fra3-1.cdninstagram.com%2Fv%2Ft51.2885-15%2F474907887_1129543381888335_8766424988966118915_n.jpg%3Fstp%3Dc0.248.640.640a_dst-jpg_e15_tt6%26_nc_ht%3Dscontent-fra3-1.cdninstagram.com%26_nc_cat%3D103%26_nc_ohc%3DYPiycXP_teMQ7kNvgF-DLaF%26_nc_gid%3D3e3699c22b4246b98fb450d120d9527f%26edm%3DAPU89FABAAAA%26ccb%3D7-5%26oh%3D00_AYAb75YGceozAPWzbq4CQQGd_q_XyfKdjdwcKYak3J-pHA%26oe%3D67AD07FF%26_nc_sid%3Dbc0c2c',
82
+ 'down_pic': 'https://scontent-fra3-1.cdninstagram.com/v/t51.2885-15/474907887_1129543381888335_8766424988966118915_n.jpg?stp=dst-jpg_e15_tt6&_nc_ht=scontent-fra3-1.cdninstagram.com&_nc_cat=103&_nc_ohc=YPiycXP_teMQ7kNvgF-DLaF&_nc_gid=3e3699c22b4246b98fb450d120d9527f&edm=APU89FABAAAA&ccb=7-5&oh=00_AYA0qczIKFhJEbukWZRRSUvJJ28iqBuhIXZqRB3EEA450Q&oe=67AD07FF&_nc_sid=bc0c2c&dl=1',
83
+ 'is_video': True,
84
+ 'video': 'https://scontent-fra3-1.cdninstagram.com/o1/v/t16/f2/m86/AQNbkAdWefxU8OJ015A7RX0oUxLJS-03KKFEbd2ueAbuWXS5jla4AsgRPhGeuuD9HsvVUz0mG5uKwOTWpYhvCVbbIPtxarY3vAIwG7A.mp4?stp=dst-mp4&efg=eyJxZV9ncm91cHMiOiJbXCJpZ193ZWJfZGVsaXZlcnlfdnRzX290ZlwiXSIsInZlbmNvZGVfdGFnIjoidnRzX3ZvZF91cmxnZW4uY2xpcHMuYzIuNzIwLmJhc2VsaW5lIn0&_nc_cat=103&vs=427033163733715_1223826287&_nc_vs=HBksFQIYUmlnX3hwdl9yZWVsc19wZXJtYW5lbnRfc3JfcHJvZC9ERDQ1NERCQUU1RkU4RDA5MzA5NzA2Q0IzODgxRUVBM192aWRlb19kYXNoaW5pdC5tcDQVAALIAQAVAhg6cGFzc3Rocm91Z2hfZXZlcnN0b3JlL0dPcGZUeHgyY1EtV3l3c0VBQjRRdldDeTRZdHFicV9FQUFBRhUCAsgBACgAGAAbABUAACamtPfZ0u%2BTQBUCKAJDMywXQF2QEGJN0vIYEmRhc2hfYmFzZWxpbmVfMV92MREAdf4HAA%3D%3D&_nc_rid=3e36937551&ccb=9-4&oh=00_AYANPnQ7Zlk5Z_6wS0OF1Hn3ttS8vsk-0K6OyM9y8xzM6A&oe=67A925DC&_nc_sid=bc0c2c',
85
+ 'down_video': 'same_as_above'
86
+ }
87
+ ]
88
+ }
89
+
90
+
91
+
92
+
93
+ ## Contributing - Sample
94
+
95
+ Coming soon.
96
+
97
+ ## License - Sample
98
+
99
+ Coming soon.
100
+
101
+ ## Tests - cd to tests folder
102
+ coverage run test_crawler.py
103
+
104
+ coverage html
@@ -0,0 +1,16 @@
1
+ LICENSE
2
+ setup.py
3
+ instagram_posts_scraper/__init__.py
4
+ instagram_posts_scraper/example.py
5
+ instagram_posts_scraper/file_operation.py
6
+ instagram_posts_scraper/instagram_posts_scraper.py
7
+ instagram_posts_scraper/parse.py
8
+ instagram_posts_scraper/request.py
9
+ instagram_posts_scraper/scraper.py
10
+ instagram_posts_scraper/utils.py
11
+ instagram_posts_scraper.egg-info/PKG-INFO
12
+ instagram_posts_scraper.egg-info/SOURCES.txt
13
+ instagram_posts_scraper.egg-info/dependency_links.txt
14
+ instagram_posts_scraper.egg-info/top_level.txt
15
+ instagram_posts_scraper/utils/__init__.py
16
+ instagram_posts_scraper/utils/utils.py
@@ -0,0 +1 @@
1
+ instagram_posts_scraper
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,25 @@
1
+ # -*- coding: utf-8 -*-
2
+ import setuptools
3
+ from setuptools import setup, find_packages
4
+
5
# Read the long description with an explicit encoding and close the file
# afterwards; the README contains non-ASCII text.
with open('README.md', encoding='utf-8') as readme_file:
    long_description = readme_file.read()

setup(
    name='instagram-posts-scraper',
    version='0.0.1',
    packages=[
        "instagram_posts_scraper",
        "instagram_posts_scraper.utils"
    ],
    license='MIT',
    description='Implement Instagram Posts Scraper for post data retrieval',
    long_description=long_description,
    long_description_content_type='text/markdown',
    author='FaustRen',
    author_email='faustren1z@gmail.com',
    url='https://github.com/FaustRen/instagram_posts_scraper',
    classifiers=[
        "Programming Language :: Python :: 3.11",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    # Third-party packages the modules import at runtime; lxml is the parser
    # name passed to BeautifulSoup.
    install_requires=[
        "beautifulsoup4",
        "cloudscraper",
        "lxml",
        "pandas",
        "pytz",
    ],
    python_requires='>=3.11',
)