instagram-scraper 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/instagram_scraper.rb +125 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 313a97bddadb02842da00028b2574c6ce1f54e4258d5ee13ff89f6d83d245341
|
4
|
+
data.tar.gz: 89b5e5e19582a42fde02825244e4c2a002b404f622e48297d67c92060b75a2ad
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9e2d57594f0f09204c7ebdb2e86b197e0265d1c785a0a28d557f3dc8331e7ec07c1a053d3ff51a8e5565419fd22e31db999a1091d914a6768c02e45d14643168
|
7
|
+
data.tar.gz: 411953eb8d6360141b69688029c7a6e9bdc2cef7149789183787a42a3133543cf7f5f923dd3a9b16b36e63023428603d77ff7c751a90adf44cfad20776b7945a
|
@@ -0,0 +1,125 @@
|
|
1
|
+
require "csv"
# open-uri and json are used directly below (URI.open, JSON.parse,
# OpenURI::HTTPError) and must be required explicitly instead of relying
# on mechanize to pull them in transitively.
require "json"
require "open-uri"
require "mechanize"

# Root of every Instagram URL built by the scraper.
BASE_URL = "https://www.instagram.com".freeze

# Captures the 32-character GraphQL queryId embedded in Instagram's
# profile-page JS bundle (see InstagramScraper#scrape_query_hash).
QUERY_ID_PATTERN = /[^\)]\.pagination},queryId:"([\w\d]{32})"/.freeze
|
6
|
+
|
7
|
+
# Scrapes the "tagged posts" feed of one or more Instagram profiles and
# exports every post that reaches a minimum number of likes to a CSV file.
#
# Usage:
#   InstagramScraper.new(%w[nike adidas], min_likes: 1000, proxies: ["1.2.3.4:8080"]).perform
class InstagramScraper
  # brands  - Array of Instagram usernames to scrape.
  # options - Hash:
  #   :min_likes   - minimum likes a post needs to be kept (default: 500);
  #                  coerced with #to_i so String input (e.g. from ARGV) works.
  #   :output_file - path of the CSV file to write (default: CWD-relative name).
  #   :proxies     - Array of "ip:port" HTTPS proxies used for GraphQL calls.
  def initialize(brands, options = {})
    @brands = brands
    # Coerce to Integer: like counts are compared with `<` below and a
    # String value (the CLI passes ARGV[1]) would raise an ArgumentError.
    @min_likes = (options[:min_likes] || 500).to_i
    @output_file = options[:output_file] || "./Instagram Data (#{brands.sort.join(', ')}).csv"
    @proxies = options[:proxies] || []
    @data = []
  end

  # Scrapes every brand, then writes the collected posts to @output_file.
  # Nothing is written when no post passed the @min_likes filter.
  def perform
    scrape_brands
    store_data_in_csv unless @data.empty?
  end

  private

  # Iterates over @brands; profiles that cannot be fetched (HTTP error,
  # e.g. a missing or private account) are skipped silently.
  def scrape_brands
    @brands.each do |brand|
      brand_data = scrape_brand_data(brand)
    rescue OpenURI::HTTPError
      next
    else
      scrape_brand_posts(brand_data)
    end
  end

  # Fetches the profile JSON (Instagram's `?__a=1` endpoint) and returns a
  # Hash with the account :id, display name (:brand) and :brand_url.
  def scrape_brand_data(brand)
    brand_url = "#{BASE_URL}/#{brand}"
    brand_data = JSON.parse(URI.open("#{brand_url}/?__a=1").read)["graphql"]["user"]
    {
      id: brand_data["id"],
      brand: brand_data["full_name"],
      brand_url: brand_url,
    }
  end

  # Pages through the brand's tagged-posts GraphQL feed, accumulating
  # qualifying posts into @data. Instagram returns a nil end_cursor on the
  # last page, which terminates the loop.
  def scrape_brand_posts(brand_data, end_cursor = "")
    query_hash = scrape_query_hash
    while end_cursor
      query_params = build_query_params(query_hash, brand_data[:id], end_cursor)
      posts_data = scrape_posts_data(query_params)
      end_cursor = posts_data["page_info"]["end_cursor"]
      posts_data["edges"].each do |post_data|
        post = parse_post_data(post_data["node"])
        @data << brand_data.slice(:brand, :brand_url).merge(post) if post
      end
      puts("Scraped #{@data.count} posts") unless @data.empty?
    end
  end

  # Extracts the GraphQL query hash from Instagram's profile-page JS bundle.
  def scrape_query_hash
    # TODO: scrape bundle name (the fingerprint below changes on deploys)
    bundle_url = "#{BASE_URL}/static/bundles/es6/ProfilePageContainer.js/b10d8b1b32fc.js"
    URI.open(bundle_url).read.match(QUERY_ID_PATTERN)[1]
  end

  # Builds the query-string parameters for one paginated GraphQL request
  # (the :variables value is itself JSON-encoded, as Instagram expects).
  def build_query_params(query_hash, brand_id, end_cursor)
    {
      query_hash: query_hash,
      variables: {
        id: brand_id,
        first: 50,
        after: end_cursor,
      }.to_json,
    }
  end

  # Fetches one page of posts through the HTTPS proxies, rotating to the
  # next proxy on any request/parse failure.
  # Raises a RuntimeError when every proxy has been exhausted — the
  # original code only printed a warning and then crashed with
  # NoMethodError on `nil.split` below.
  def scrape_posts_data(query_params, posts_data = [], proxy_index = 0)
    agent = Mechanize.new
    url = "#{BASE_URL}/graphql/query/?#{URI.encode_www_form(query_params)}"
    while posts_data.empty?
      proxy = @proxies[proxy_index]
      # Fail loudly instead of dereferencing a nil proxy.
      raise "No more proxies available" unless proxy

      ip, port = proxy.split(":")
      agent.set_proxy(ip, port.to_i)
      begin
        posts_data = JSON.parse(agent.get(url).body)["data"]["user"]["edge_user_to_photos_of_you"]
      rescue StandardError
        # Request failed or response was not the expected JSON: try the
        # next proxy on the following iteration.
        proxy_index += 1
      end
    end
    posts_data
  end

  # Maps a raw GraphQL post node to the Hash stored in @data, or nil when
  # the post has fewer than @min_likes likes.
  def parse_post_data(post_data) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
    publisher = post_data["owner"]["username"]
    likes = post_data["edge_liked_by"]["count"]
    return if likes < @min_likes

    {
      publisher: publisher,
      publisher_url: "#{BASE_URL}/#{publisher}",
      post_url: "#{BASE_URL}/p/#{post_data['shortcode']}",
      likes: likes,
      comments: post_data["edge_media_to_comment"]["count"],
      # Time.at, not Time.zone.at: Time.zone is ActiveSupport and this gem
      # has no dependencies, so the original raised NoMethodError here.
      date: Time.at(post_data["taken_at_timestamp"]).strftime("%d/%m/%Y"),
      # Caption may be absent (empty edges array); newlines are flattened
      # so the caption stays on one CSV row.
      caption: post_data["edge_media_to_caption"]["edges"]&.first&.[]("node")&.[]("text")&.gsub(/\n/, " "),
    }
  end

  # Writes @data to @output_file with humanized column headers
  # (e.g. :publisher_url => "Publisher url").
  def store_data_in_csv
    headers = @data.first.keys.map { |key| key.to_s.tr("_", " ").capitalize }
    CSV.open(@output_file, "wb", write_headers: true, headers: headers) do |csv|
      @data.each { |post| csv << post.values }
    end
  end
end
|
117
|
+
|
118
|
+
# --- CLI entry point -------------------------------------------------------
# Usage: instagram_scraper BRANDS [MIN_LIKES]
#   BRANDS    - comma-separated Instagram usernames, e.g. "nike,adidas"
#   MIN_LIKES - optional minimum likes per post (default: 500)
abort("Usage: instagram_scraper BRANDS [MIN_LIKES]") if ARGV[0].nil? || ARGV[0].empty?

options = {
  # &.to_i: ARGV values are Strings; the scraper compares likes numerically,
  # and nil falls through to the scraper's default of 500.
  min_likes: ARGV[1]&.to_i,
  # Dir.home instead of a hard-coded /Users/<name> path, so the script
  # works for any user.
  output_file: "#{Dir.home}/Desktop/Instagram Data (#{ARGV[0].gsub(',', ', ').strip}).csv",
  # Free HTTPS proxies, one "ip:port" per CRLF-separated line.
  proxies: URI.open("https://www.proxy-list.download/api/v1/get?type=https").read.split(/\r\n/),
}
scraper = InstagramScraper.new(ARGV[0].split(","), options)
scraper.perform
# Best effort: open the resulting CSV in Numbers (macOS); errors are discarded.
`open -a Numbers '#{options[:output_file]}' 2>/dev/null`
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: instagram-scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- gabrielecanepa
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-08-02 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Scrape tagged posts and their corresponding publishers from an Instagram profile.
|
14
|
+
email: contact@gabrielecanepa.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/instagram_scraper.rb
|
20
|
+
homepage: https://rubygems.org/gems/instagram-scraper
|
21
|
+
licenses:
|
22
|
+
- MIT
|
23
|
+
metadata:
|
24
|
+
source_code_uri: https://github.com/gabrielecanepa/instagram-scraper
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubygems_version: 3.0.3
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Easily scrape any Instagram profile.
|
44
|
+
test_files: []
|