instagram-scraper 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/instagram_scraper.rb +125 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 313a97bddadb02842da00028b2574c6ce1f54e4258d5ee13ff89f6d83d245341
|
4
|
+
data.tar.gz: 89b5e5e19582a42fde02825244e4c2a002b404f622e48297d67c92060b75a2ad
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9e2d57594f0f09204c7ebdb2e86b197e0265d1c785a0a28d557f3dc8331e7ec07c1a053d3ff51a8e5565419fd22e31db999a1091d914a6768c02e45d14643168
|
7
|
+
data.tar.gz: 411953eb8d6360141b69688029c7a6e9bdc2cef7149789183787a42a3133543cf7f5f923dd3a9b16b36e63023428603d77ff7c751a90adf44cfad20776b7945a
|
@@ -0,0 +1,125 @@
|
|
1
|
+
# frozen_string_literal: true

require "csv"       # CSV output of scraped posts
require "json"      # JSON.parse of Instagram API responses (was only loaded transitively)
require "mechanize" # proxied HTTP client for the GraphQL endpoint
require "open-uri"  # URI.open for profile data, the JS bundle, and the proxy list (was missing)

# Root of every Instagram URL this scraper builds.
BASE_URL = "https://www.instagram.com".freeze
# Captures the 32-character GraphQL query hash out of Instagram's
# ProfilePageContainer JS bundle (group 1).
QUERY_ID_PATTERN = /[^\)]\.pagination},queryId:"([\w\d]{32})"/.freeze
|
6
|
+
|
7
|
+
# Scrapes posts in which the given Instagram profiles ("brands") are tagged,
# filters them by a minimum like count, and writes the result to a CSV file.
class InstagramScraper
  DEFAULT_MIN_LIKES = 500

  # brands  - Array of Instagram usernames to scrape.
  # options - Hash:
  #   :min_likes   - minimum likes for a post to be kept (String or Integer,
  #                  coerced to Integer; default 500)
  #   :output_file - path of the CSV file to write
  #   :proxies     - Array of "ip:port" strings used for GraphQL requests
  def initialize(brands, options = {})
    @brands = brands
    # Coerce: callers pass ARGV values, which are Strings; `likes < @min_likes`
    # would raise ArgumentError on a String comparison.
    @min_likes = (options[:min_likes] || DEFAULT_MIN_LIKES).to_i
    @output_file = options[:output_file] || "./Instagram Data (#{brands.sort.join(', ')}).csv"
    @proxies = options[:proxies] || []
    @data = []
  end

  # Runs the full scrape and writes the CSV (skipped if nothing was collected).
  def perform
    scrape_brands
    store_data_in_csv unless @data.empty?
  end

  private

  # Scrapes each brand, silently skipping profiles whose lookup fails
  # (404, rate limit, ...).
  def scrape_brands
    @brands.each do |brand|
      brand_data = scrape_brand_data(brand)
    rescue OpenURI::HTTPError
      next
    else
      scrape_brand_posts(brand_data)
    end
  end

  # Fetches a profile's public JSON (?__a=1 endpoint) and returns its
  # id, display name and URL. Raises OpenURI::HTTPError on failure.
  def scrape_brand_data(brand)
    brand_url = "#{BASE_URL}/#{brand}"
    brand_data = JSON.parse(URI.open("#{brand_url}/?__a=1").read)["graphql"]["user"]
    {
      id: brand_data["id"],
      brand: brand_data["full_name"],
      brand_url: brand_url,
    }
  end

  # Pages through the brand's tagged posts via the GraphQL endpoint until
  # Instagram returns a nil end_cursor.
  def scrape_brand_posts(brand_data, end_cursor = "")
    query_hash = scrape_query_hash
    while end_cursor
      query_params = build_query_params(query_hash, brand_data[:id], end_cursor)
      posts_data = scrape_posts_data(query_params)
      end_cursor = posts_data["page_info"]["end_cursor"]
      posts_data["edges"].each do |post_data|
        post = parse_post_data(post_data["node"])
        @data << brand_data.slice(:brand, :brand_url).merge(post) if post
      end
      puts("Scraped #{@data.count} posts") unless @data.empty?
    end
  end

  # Extracts the current GraphQL query hash from Instagram's profile-page
  # JS bundle. NOTE(review): the bundle filename is hard-coded and rotates
  # on Instagram deploys.
  def scrape_query_hash
    # TODO: scrape bundle name
    bundle_url = "#{BASE_URL}/static/bundles/es6/ProfilePageContainer.js/b10d8b1b32fc.js"
    URI.open(bundle_url).read.match(QUERY_ID_PATTERN)[1]
  end

  # Builds the query-string parameters for one GraphQL page request.
  def build_query_params(query_hash, brand_id, end_cursor)
    {
      query_hash: query_hash,
      variables: {
        id: brand_id,
        first: 50,
        after: end_cursor,
      }.to_json,
    }
  end

  # Requests one page of tagged posts through the proxy list, rotating to the
  # next proxy on any request/parse failure.
  # Raises RuntimeError when every proxy has been exhausted (the original code
  # only printed a warning and then crashed on nil.split).
  def scrape_posts_data(query_params, posts_data = [], proxy_index = 0)
    agent = Mechanize.new
    url = "#{BASE_URL}/graphql/query/?#{URI.encode_www_form(query_params)}"
    while posts_data.empty?
      proxy = @proxies[proxy_index]
      raise "No more proxies available" unless proxy

      ip, port = proxy.split(":")
      agent.set_proxy(ip, port.to_i)
      begin
        posts_data = JSON.parse(agent.get(url).body)["data"]["user"]["edge_user_to_photos_of_you"]
      rescue StandardError
        proxy_index += 1
      end
    end
    posts_data
  end

  # Maps one GraphQL post node to a flat hash; returns nil when the post is
  # below the likes threshold.
  def parse_post_data(post_data) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
    publisher = post_data["owner"]["username"]
    likes = post_data["edge_liked_by"]["count"]
    return if likes < @min_likes

    {
      publisher: publisher,
      publisher_url: "#{BASE_URL}/#{publisher}",
      post_url: "#{BASE_URL}/p/#{post_data['shortcode']}",
      likes: likes,
      comments: post_data["edge_media_to_comment"]["count"],
      # Time.at, not Time.zone.at: Time.zone is ActiveSupport-only and is
      # undefined in plain Ruby.
      date: Time.at(post_data["taken_at_timestamp"]).strftime("%d/%m/%Y"),
      # Caption may be absent; newlines flattened so the CSV stays one row per post.
      caption: post_data["edge_media_to_caption"]["edges"]&.first&.[]("node")&.[]("text")&.gsub(/\n/, " "),
    }
  end

  # Writes collected posts to @output_file with humanized column headers.
  def store_data_in_csv
    headers = @data.first.keys.map { |key| key.to_s.tr("_", " ").capitalize }
    CSV.open(@output_file, "wb", write_headers: true, headers: headers) do |csv|
      @data.each { |post| csv << post.values }
    end
  end
end
|
117
|
+
|
118
|
+
# CLI entry point: ruby instagram_scraper.rb BRANDS [MIN_LIKES]
#   BRANDS    - comma-separated Instagram usernames
#   MIN_LIKES - optional integer threshold
abort("Usage: instagram_scraper BRANDS [MIN_LIKES]") if ARGV[0].nil? || ARGV[0].empty?

options = {
  # ARGV entries are Strings; coerce so the likes comparison is numeric
  # (nil when absent, letting the scraper fall back to its default).
  min_likes: ARGV[1]&.to_i,
  # Use the invoking user's home directory instead of a hard-coded
  # /Users/gabrielecanepa path.
  output_file: File.join(Dir.home, "Desktop", "Instagram Data (#{ARGV[0].gsub(',', ', ').strip}).csv"),
  # Fetch a fresh list of HTTPS proxies; the API returns one "ip:port" per CRLF line.
  proxies: URI.open("https://www.proxy-list.download/api/v1/get?type=https").read.split(/\r\n/),
}
scraper = InstagramScraper.new(ARGV[0].split(","), options)
scraper.perform
# Best-effort convenience on macOS: open the CSV in Numbers; errors are discarded.
`open -a Numbers '#{options[:output_file]}' 2>/dev/null`
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: instagram-scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- gabrielecanepa
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-08-02 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Scrape from an Instagram profile tagged posts and correspondant publishers.
|
14
|
+
email: contact@gabrielecanepa.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/instagram_scraper.rb
|
20
|
+
homepage: https://rubygems.org/gems/instagram-scraper
|
21
|
+
licenses:
|
22
|
+
- MIT
|
23
|
+
metadata:
|
24
|
+
source_code_uri: https://github.com/gabrielecanepa/instagram-scraper
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubygems_version: 3.0.3
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Easily scrape any Instagram profile.
|
44
|
+
test_files: []
|