facebook_profile_scraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6c4e62ff516ff41217241e4633bbc186083ad3f7
4
+ data.tar.gz: 98b762dcbe415cfe865df7384700b8cbb3dfc48d
5
+ SHA512:
6
+ metadata.gz: 56458fb203a44aecd9610030e9b5ddb86536b503f33f613f975e672c6162416a11c6e4b6b9a00ca14576fa25109cef900d70fce27684b2d406c9125c06815804
7
+ data.tar.gz: 8d18c66ad5107f7ec47de168f821571962ee3813789ad7da8f41bc99c31a3cd9ef28c87e511adeed0cb9d1c24116b4d0b717cd73d2170c8af1f93b0399c1a176
data/.gitignore ADDED
@@ -0,0 +1,26 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.bundle
19
+ *.so
20
+ *.o
21
+ *.a
22
+ mkmf.log
23
+
24
+ .byebug_history
25
+ .env
26
+ example.rb
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+ ruby '2.3.1'
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2016 Maros Hluska
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Facebook Profile Scraper
2
+
3
+ Scrape your friends' Facebook photos.
4
+
5
+ ## Installation
6
+
7
+ ```
8
+ gem install facebook_profile_scraper
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ Create a `.env` file in the directory you run the scraper from, containing your Facebook login information:
14
+
15
+ ```
16
+ FACEBOOK_EMAIL=
17
+ FACEBOOK_PASSWORD=
18
+ ```
19
+
20
+ Use the scraper like this:
21
+
22
+ ```
23
+ require 'facebook_profile_scraper'
24
+
25
+ scraper = FacebookProfileScraper::Scraper.new
26
+ scraper.scrape('facebook_userid')
27
+ ```
28
+
29
+ Photos are downloaded into the local `tmp` directory, with one subdirectory per album named after the album title.
data/Rakefile ADDED
@@ -0,0 +1,9 @@
require 'bundler/gem_tasks'

# Define the `spec` task when RSpec can be loaded; otherwise carry on without
# it so the remaining tasks still work.
begin
  require 'rspec/core/rake_task'
  RSpec::Core::RakeTask.new(:spec)
rescue LoadError
  # Handle RSpec not being available on a production environment.
end
9
+
@@ -0,0 +1,27 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'facebook_profile_scraper/version'

# Packaging metadata for the facebook_profile_scraper gem.
Gem::Specification.new do |spec|
  spec.name          = 'facebook_profile_scraper'
  spec.version       = FacebookProfileScraper::VERSION
  spec.authors       = ['Maros Hluska']
  spec.email         = ['mhluska@gmail.com']
  spec.summary       = "Scrape a friend's Facebook profile page"
  spec.description   = "Scrape photos and other data from a friend's Facebook profile page"
  spec.homepage      = 'http://mhluska.com/'
  spec.license       = 'MIT'

  # Package every file tracked by git. Files under bin/ are exposed as
  # executables; files under test/, spec/ or features/ are listed as tests.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  # lib/facebook_profile_scraper/scraper.rb requires `capybara` and
  # `capybara/poltergeist` when the gem is loaded, so both must be runtime
  # dependencies; without them `require 'facebook_profile_scraper'` raises
  # LoadError on a clean install.
  # NOTE(review): the two version constraints below were chosen to match
  # releases contemporary with the other pins -- confirm before publishing.
  spec.add_dependency 'capybara', '~> 2.7'
  spec.add_dependency 'poltergeist', '~> 1.10'
  spec.add_dependency 'selenium-webdriver', '~> 2.53'
  spec.add_dependency 'dotenv', '~> 2.1'

  spec.add_development_dependency 'bundler', '~> 1.6'
  spec.add_development_dependency 'rake', '~> 11.2'
  spec.add_development_dependency 'rspec', '~> 3.5'
end
@@ -0,0 +1,5 @@
require 'facebook_profile_scraper/version'
require 'facebook_profile_scraper/scraper'

# Namespace for the gem. This file only loads the version constant and the
# scraper; the module body is reopened by those files.
module FacebookProfileScraper
end
@@ -0,0 +1,141 @@
1
+ require 'capybara'
2
+ require 'capybara/dsl'
3
+ require 'capybara/poltergeist'
4
+ require 'dotenv'
5
+
# Chrome-backed Selenium driver. The preferences disable the download prompt
# and set the download directory to `tmp/downloads` under the current working
# directory.
Capybara.register_driver(:chrome) do |app|
  chrome_prefs = {
    download: {
      prompt_for_download: false,
      default_directory: "#{Dir.pwd}/tmp/downloads"
    }
  }
  Capybara::Selenium::Driver.new(app, browser: :chrome, prefs: chrome_prefs)
end

# TODO(maros): Figure out how to do this with `poltergeist` instead of
# `chromedriver`. We use the latter because `poltergeist` can't seem to
# download images.
# Capybara.register_driver :poltergeist do |app|
#   Capybara::Poltergeist::Driver.new(app, phantomjs_logger: '/dev/null', js_errors: false, timeout: 60 * 10)
# end

# Global Capybara settings: no local app server, relative paths resolve
# against facebook.com, and finders also match hidden elements.
Capybara.configure do |settings|
  settings.run_server = false
  # settings.default_driver = :poltergeist
  settings.default_driver = :chrome
  settings.app_host = 'https://www.facebook.com'
  settings.ignore_hidden_elements = false
end

# Load FACEBOOK_EMAIL / FACEBOOK_PASSWORD (and anything else) from `.env`.
Dotenv.load
32
+
module FacebookProfileScraper
  # Logs in to Facebook through a Capybara-driven browser session and downloads
  # the photos of every album linked from a profile's albums page.
  class Scraper
    include Capybara::DSL

    # `download_photo` gives up once looking for the download link has failed
    # more than this many times.
    MAX_DOWNLOAD_FAILURES = 3

    # Logs in, then downloads the photos of each album of +username+.
    #
    # @param username [String] profile identifier, used to build the
    #   `/<username>/photos_albums` path
    # @raise [KeyError] if FACEBOOK_EMAIL or FACEBOOK_PASSWORD is not set
    def scrape(username)
      login
      scrape_photos(username)
    end

    private

    # Returns the `<a>` elements on the current page whose `href` contains
    # the given substring.
    def find_link_elems_with(href)
      all('a').select do |elem|
        # Read the attribute once; each lookup goes through the driver.
        target = elem[:href]
        target && target.include?(href)
      end
    end

    # Returns the distinct `href` values of the links matching +href+.
    def find_links_with(href)
      find_link_elems_with(href).map { |elem| elem[:href] }.uniq
    end

    # Clicks the photo's dropdown button and then its download link, retrying
    # while the link cannot be found.
    def download_photo
      failures = 0

      # TODO(maros): Use something like `wait_for` instead.
      loop do
        find('.fbPhotoSnowliftDropdownButton').click

        # This will occur if the image can't be downloaded.
        # TODO(maros): Resort to scraping the smaller res image instead.
        return if failures > MAX_DOWNLOAD_FAILURES

        begin
          download_link = find('a[data-action-type="download_photo"]', visible: true)
        rescue Capybara::ElementNotFound
          failures += 1
          sleep 1
        else
          download_link.click
          return
        end
      end
    end

    # Scrolls to the bottom of the page repeatedly until the number of
    # `/photo.php` links stops changing, and returns the link elements.
    def load_all_photo_links
      loaded = []

      # Get all photos to load despite infinite scroll.
      # NOTE(review): nothing waits between scrolling and re-counting, so this
      # may stop early if new photos have not been added yet -- confirm.
      loop do
        execute_script('window.scrollTo(0, document.body.scrollHeight);')
        current = find_link_elems_with('/photo.php')
        break if current.length == loaded.length
        loaded = current
      end

      loaded
    end

    # Renames `tmp/downloads` to `tmp/<album title>`.
    def move_downloads_to_album_directory
      # The title is page content, so replace path separators as well as
      # spaces; otherwise a title such as "a/b" would aim the rename at a
      # nested directory or outside `tmp`.
      title = find('.fbPhotoAlbumTitle').text.downcase.gsub(%r{[ /\\]}, '_')

      begin
        File.rename("#{Dir.pwd}/tmp/downloads", "#{Dir.pwd}/tmp/#{title}")
      rescue Errno::ENOENT
        # This will fail if no files were downloaded because the
        # `tmp/downloads` directory will not exist.
      end
    end

    # Visits one album page and downloads each of its photos.
    def scrape_album(href)
      visit(href)

      # HACK(maros): Make the backdrop for Chrome Notifications go away. Find a
      # pref for `chromedriver` to make this disabled by default.
      find('._3ixn').click

      photo_links = load_all_photo_links

      # Remove the cover photo and profile photo.
      photo_links.shift(2)
      return if photo_links.empty?

      # Click the first photo link, then download and page forward once per
      # link.
      photo_links.first.click
      photo_links.length.times do
        download_photo
        find('.snowliftPager.next').click
      end

      # Add directory name for album.
      move_downloads_to_album_directory
    end

    # Visits the profile's albums page and scrapes every `/media/set` link.
    def scrape_photos(username)
      visit("/#{username}/photos_albums")
      find_links_with('/media/set').each { |link| scrape_album(link) }
    end

    # Submits the login form with the credentials from the environment and
    # works through any login-code / checkpoint screens that follow.
    #
    # @raise [KeyError] if FACEBOOK_EMAIL or FACEBOOK_PASSWORD is not set
    # @raise [IOError] if a login code is requested but standard input is at
    #   end-of-file
    def login
      # Fail before opening the browser when the credentials are missing.
      email = ENV.fetch('FACEBOOK_EMAIL')
      password = ENV.fetch('FACEBOOK_PASSWORD')

      visit('/login')
      fill_in('email', with: email)
      fill_in('pass', with: password)
      click_button('loginbutton')

      while has_css?('#approvals_code')
        print 'Enter your 6-digit login code: '
        # Read from $stdin explicitly: bare `gets` reads from files named in
        # ARGV when there are any.
        code = $stdin.gets
        raise IOError, 'no login code available on standard input' if code.nil?

        fill_in('approvals_code', with: code.chomp)
        click_button('checkpointSubmitButton')
      end

      while has_css?('#checkpointSubmitButton')
        click_button('checkpointSubmitButton')
      end
    end
  end
end
@@ -0,0 +1,3 @@
module FacebookProfileScraper
  # Gem version string; the gemspec reads it for `spec.version`. Frozen so the
  # constant cannot be mutated in place.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,8 @@
require 'facebook_profile_scraper'

RSpec.describe FacebookProfileScraper::Scraper, js: true do
  # The body is still a placeholder, so the example is skipped rather than
  # left empty: an empty example is reported as passing.
  it 'logs in', skip: 'not implemented: needs real Facebook credentials and a browser session' do
    # `login` is private, hence `send`.
    # subject.send(:login)
    # expect(subject.page.find('#pagelet_welcome_box')).to be_truthy
  end
end
@@ -0,0 +1,110 @@
require 'capybara'
require 'capybara/poltergeist'

# Load the library before choosing a driver. Loading it runs its own
# `Capybara.configure`, which sets `default_driver` to `:chrome`; if a spec
# file were the first to require it, that would overwrite the setting below.
require 'facebook_profile_scraper'

Capybara.configure do |config|
  config.default_driver = :poltergeist
end

# This file was generated by the `rspec --init` command. Conventionally, all
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
# The generated `.rspec` file contains `--require spec_helper` which will cause
# this file to always be loaded, without a need to explicitly require it in any
# files.
#
# Given that it is always loaded, you are encouraged to keep this file as
# light-weight as possible. Requiring heavyweight dependencies from this file
# will add to the boot time of your test suite on EVERY test run, even for an
# individual file that may not need all of that loaded. Instead, consider making
# a separate helper file that requires the additional dependencies and performs
# the additional setup, and require it from the spec files that actually need
# it.
#
# The `.rspec` file also contains a few flags that are not defaults but that
# users commonly want.
#
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
RSpec.configure do |config|
  # rspec-expectations config goes here. You can use an alternate
  # assertion/expectation library such as wrong or the stdlib/minitest
  # assertions if you prefer.
  config.expect_with :rspec do |expectations|
    # This option will default to `true` in RSpec 4. It makes the `description`
    # and `failure_message` of custom matchers include text for helper methods
    # defined using `chain`, e.g.:
    #     be_bigger_than(2).and_smaller_than(4).description
    #     # => "be bigger than 2 and smaller than 4"
    # ...rather than:
    #     # => "be bigger than 2"
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  # rspec-mocks config goes here. You can use an alternate test double
  # library (such as bogus or mocha) by changing the `mock_with` option here.
  config.mock_with :rspec do |mocks|
    # Prevents you from mocking or stubbing a method that does not exist on
    # a real object. This is generally recommended, and will default to
    # `true` in RSpec 4.
    mocks.verify_partial_doubles = true
  end

  # This option will default to `:apply_to_host_groups` in RSpec 4 (and will
  # have no way to turn it off -- the option exists only for backwards
  # compatibility in RSpec 3). It causes shared context metadata to be
  # inherited by the metadata hash of host groups and examples, rather than
  # triggering implicit auto-inclusion in groups with matching metadata.
  config.shared_context_metadata_behavior = :apply_to_host_groups

# The settings below are suggested to provide a good initial experience
# with RSpec, but feel free to customize to your heart's content.
=begin
  # This allows you to limit a spec run to individual examples or groups
  # you care about by tagging them with `:focus` metadata. When nothing
  # is tagged with `:focus`, all examples get run. RSpec also provides
  # aliases for `it`, `describe`, and `context` that include `:focus`
  # metadata: `fit`, `fdescribe` and `fcontext`, respectively.
  config.filter_run_when_matching :focus

  # Allows RSpec to persist some state between runs in order to support
  # the `--only-failures` and `--next-failure` CLI options. We recommend
  # you configure your source control system to ignore this file.
  config.example_status_persistence_file_path = "spec/examples.txt"

  # Limits the available syntax to the non-monkey patched syntax that is
  # recommended. For more details, see:
  #   - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/
  #   - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
  #   - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode
  config.disable_monkey_patching!

  # This setting enables warnings. It's recommended, but in some cases may
  # be too noisy due to issues in dependencies.
  config.warnings = true

  # Many RSpec users commonly either run the entire suite or an individual
  # file, and it's useful to allow more verbose output when running an
  # individual spec file.
  if config.files_to_run.one?
    # Use the documentation formatter for detailed output,
    # unless a formatter has already been configured
    # (e.g. via a command-line flag).
    config.default_formatter = 'doc'
  end

  # Print the 10 slowest examples and example groups at the
  # end of the spec run, to help surface which specs are running
  # particularly slow.
  config.profile_examples = 10

  # Run specs in random order to surface order dependencies. If you find an
  # order dependency and want to debug it, you can fix the order by providing
  # the seed, which is printed after each run.
  #     --seed 1234
  config.order = :random

  # Seed global randomization in this process using the `--seed` CLI option.
  # Setting this allows you to use `--seed` to deterministically reproduce
  # test failures related to randomization by passing the same `--seed` value
  # as the one that triggered the failure.
  Kernel.srand config.seed
=end
end
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: facebook_profile_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Maros Hluska
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-08-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: selenium-webdriver
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.53'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.53'
27
+ - !ruby/object:Gem::Dependency
28
+ name: dotenv
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.1'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.1'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.6'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.6'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '11.2'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '11.2'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.5'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.5'
83
+ description: Scrape photos and other data from a friend's Facebook profile page
84
+ email:
85
+ - mhluska@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - ".gitignore"
91
+ - ".rspec"
92
+ - Gemfile
93
+ - LICENSE.txt
94
+ - README.md
95
+ - Rakefile
96
+ - facebook_profile_scraper.gemspec
97
+ - lib/facebook_profile_scraper.rb
98
+ - lib/facebook_profile_scraper/scraper.rb
99
+ - lib/facebook_profile_scraper/version.rb
100
+ - spec/scraper_spec.rb
101
+ - spec/spec_helper.rb
102
+ homepage: http://mhluska.com/
103
+ licenses:
104
+ - MIT
105
+ metadata: {}
106
+ post_install_message:
107
+ rdoc_options: []
108
+ require_paths:
109
+ - lib
110
+ required_ruby_version: !ruby/object:Gem::Requirement
111
+ requirements:
112
+ - - ">="
113
+ - !ruby/object:Gem::Version
114
+ version: '0'
115
+ required_rubygems_version: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: '0'
120
+ requirements: []
121
+ rubyforge_project:
122
+ rubygems_version: 2.6.6
123
+ signing_key:
124
+ specification_version: 4
125
+ summary: Scrape a friend's Facebook profile page
126
+ test_files:
127
+ - spec/scraper_spec.rb
128
+ - spec/spec_helper.rb