facebook_profile_scraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6c4e62ff516ff41217241e4633bbc186083ad3f7
4
+ data.tar.gz: 98b762dcbe415cfe865df7384700b8cbb3dfc48d
5
+ SHA512:
6
+ metadata.gz: 56458fb203a44aecd9610030e9b5ddb86536b503f33f613f975e672c6162416a11c6e4b6b9a00ca14576fa25109cef900d70fce27684b2d406c9125c06815804
7
+ data.tar.gz: 8d18c66ad5107f7ec47de168f821571962ee3813789ad7da8f41bc99c31a3cd9ef28c87e511adeed0cb9d1c24116b4d0b717cd73d2170c8af1f93b0399c1a176
data/.gitignore ADDED
@@ -0,0 +1,26 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.bundle
19
+ *.so
20
+ *.o
21
+ *.a
22
+ mkmf.log
23
+
24
+ .byebug_history
25
+ .env
26
+ example.rb
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+ ruby '2.3.1'
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2016 Maros Hluska
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Facebook Profile Scraper
2
+
3
+ Scrape your friends' Facebook photos.
4
+
5
+ ## Installation
6
+
7
+ ```
8
+ gem install facebook_profile_scraper
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ Create a `.env` file with your Facebook login information:
14
+
15
+ ```
16
+ FACEBOOK_EMAIL=
17
+ FACEBOOK_PASSWORD=
18
+ ```
19
+
20
+ Use the scraper like this:
21
+
22
+ ```
23
+ require 'facebook_profile_scraper'
24
+
25
+ scraper = FacebookProfileScraper::Scraper.new
26
+ scraper.scrape('facebook_userid')
27
+ ```
28
+
29
+ Downloaded photos are saved under the local `tmp` directory, in one subdirectory per album named after the album title.
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
+ require 'bundler/gem_tasks'
2
+
3
+ begin
4
+ require 'rspec/core/rake_task'
5
+ RSpec::Core::RakeTask.new(:spec)
6
+ # Handle RSpec not being available on a production environment.
7
+ rescue LoadError
8
+ end
9
+
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'facebook_profile_scraper/version'
5
+
6
# Gem specification for facebook_profile_scraper.
#
# The runtime dependency list has to cover every library that
# lib/facebook_profile_scraper/scraper.rb requires when it is loaded
# (capybara, capybara/poltergeist, dotenv) plus selenium-webdriver, which backs
# the Chrome driver. Without them `require 'facebook_profile_scraper'` raises
# LoadError on a fresh `gem install`.
Gem::Specification.new do |spec|
  spec.name          = 'facebook_profile_scraper'
  spec.version       = FacebookProfileScraper::VERSION
  spec.authors       = ['Maros Hluska']
  spec.email         = ['mhluska@gmail.com']
  spec.summary       = "Scrape a friend's Facebook profile page"
  spec.description   = "Scrape photos and other data from a friend's Facebook profile page"
  spec.homepage      = 'http://mhluska.com/'
  spec.license       = 'MIT'

  # The packaged file list comes from git, so the gem must be built from a
  # git checkout.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  # Libraries required at load time by the scraper.
  spec.add_dependency 'capybara', '~> 2.7'
  spec.add_dependency 'poltergeist', '~> 1.10'
  spec.add_dependency 'selenium-webdriver', '~> 2.53'
  spec.add_dependency 'dotenv', '~> 2.1'

  spec.add_development_dependency 'bundler', '~> 1.6'
  spec.add_development_dependency 'rake', '~> 11.2'
  spec.add_development_dependency 'rspec', '~> 3.5'
end
@@ -0,0 +1,5 @@
1
+ require 'facebook_profile_scraper/version'
2
+ require 'facebook_profile_scraper/scraper'
3
+
4
# Root namespace of the gem. The version constant and the Scraper class are
# defined in the files required by this entry point.
module FacebookProfileScraper; end
@@ -0,0 +1,141 @@
1
+ require 'capybara'
2
+ require 'capybara/dsl'
3
+ require 'capybara/poltergeist'
4
+ require 'dotenv'
5
+
6
# Global Capybara settings used by the scraper: drive the live Facebook site
# through the `:chrome` driver instead of booting a local app server.
Capybara.configure do |capybara|
  capybara.app_host = 'https://www.facebook.com'
  capybara.run_server = false
  # The alternative that was tried here is `:poltergeist`:
  # capybara.default_driver = :poltergeist
  capybara.default_driver = :chrome
  # Finders should also match elements that are currently hidden.
  capybara.ignore_hidden_elements = false
end
13
+
14
# Register a Selenium-backed Chrome driver under the name `:chrome`.
#
# The Chrome preferences turn off the download prompt and point downloads at
# `tmp/downloads` below the working directory in effect when the driver is
# built.
Capybara.register_driver(:chrome) do |app|
  download_prefs = {
    prompt_for_download: false,
    default_directory: "#{Dir.pwd}/tmp/downloads"
  }

  Capybara::Selenium::Driver.new(
    app,
    browser: :chrome,
    prefs: { download: download_prefs }
  )
end
23
+
24
+ # TODO(maros): Figure out how to do this with `poltergeist` instead of
25
+ # `chromedriver`. We use the latter because `poltergeist` can't seem to
26
+ # download images.
27
+ # Capybara.register_driver :poltergeist do |app|
28
+ # Capybara::Poltergeist::Driver.new(app, phantomjs_logger: '/dev/null', js_errors: false, timeout: 60 * 10)
29
+ # end
30
+
31
+ Dotenv.load
32
+
33
+ module FacebookProfileScraper
34
+ class Scraper
35
+ include Capybara::DSL
36
+
37
# Public entry point: sign in to Facebook, then download the photo albums of
# the given profile.
#
# @param username [String] profile identifier as used in the profile URL
# @return the value returned by scrape_photos
def scrape(username)
  login

  scrape_photos(username)
end
41
+
42
+ private
43
+
44
# Collect the anchor elements on the current page whose `href` attribute
# contains the given fragment. Anchors without an `href` are skipped.
#
# @param href [String] substring to look for inside each link target
# @return [Array] the matching anchor elements
def find_link_elems_with(href)
  all('a').select do |anchor|
    target = anchor[:href]
    target && target.include?(href)
  end
end
47
+
48
# Like find_link_elems_with, but returns the link targets themselves with
# duplicates removed.
#
# @param href [String] substring to look for inside each link target
# @return [Array<String>] unique `href` values of the matching anchors
def find_links_with(href)
  targets = find_link_elems_with(href).map { |anchor| anchor[:href] }
  targets.uniq
end
51
+
52
# Download the photo currently shown in the photo viewer.
#
# Each pass clicks the viewer's options dropdown and then looks for the
# visible "download" entry. If the entry is missing the pass is counted as a
# failure and retried after a one-second pause; once more than three failures
# have accumulated the method gives up without downloading.
#
# TODO(maros): Use something like `wait_for` instead.
# TODO(maros): Resort to scraping the smaller res image when the download
# entry never shows up.
#
# @return [nil]
def download_photo
  failures = 0

  loop do
    find('.fbPhotoSnowliftDropdownButton').click

    # The image can't be downloaded; stop trying.
    return if failures > 3

    begin
      download_link = find('a[data-action-type="download_photo"]', visible: true)
    rescue Capybara::ElementNotFound
      failures += 1
      sleep 1
      next
    end

    download_link.click
    break
  end
end
75
+
76
# Download every photo of one album and move the downloads into
# `tmp/<album name>`.
#
# @param href [String] URL of the album page
# @return [void]
def scrape_album(href)
  visit(href)

  # HACK(maros): Make the backdrop for Chrome Notifications go away. Find a
  # pref for `chromedriver` to make this disabled by default.
  find('._3ixn').click

  photo_links = []

  # Get all photos to load despite infinite scroll: keep scrolling to the
  # bottom until a pass finds no more photo links than the previous one.
  loop do
    execute_script('window.scrollTo(0, document.body.scrollHeight);')
    links = find_link_elems_with('/photo.php')
    break if links.length == photo_links.length
    photo_links = links
  end

  # Remove the cover photo and profile photo.
  photo_links.shift(2)

  return if photo_links.empty?

  # Open the first photo in the viewer, then save and page forward once per
  # photo in the album.
  photo_links.first.click
  photo_links.length.times do
    download_photo
    find('.snowliftPager.next').click
  end

  # Name the album directory after the (sanitized) album title.
  album_dir = album_directory_name(find('.fbPhotoAlbumTitle').text)

  begin
    File.rename("#{Dir.pwd}/tmp/downloads", "#{Dir.pwd}/tmp/#{album_dir}")
  rescue Errno::ENOENT
    # No files were downloaded, so the `tmp/downloads` directory does not
    # exist.
  end
end

# Turn an album title scraped from the page into a single directory name.
#
# The title is page content, so path separators and NUL bytes are replaced to
# keep the target inside `tmp/`; titles that would resolve to `tmp` itself or
# its parent fall back to a fixed name.
#
# @param title [String] album title as displayed on the page
# @return [String] lower-cased, underscore-separated directory name
def album_directory_name(title)
  name = title.downcase.gsub(' ', '_').gsub(%r{[/\\\x00]}, '_')
  return 'untitled_album' if name.empty? || name == '.' || name == '..'

  name
end
118
+
119
# Open the profile's album overview and scrape each album linked from it.
#
# The album URLs are collected up front, before any album page is visited.
#
# @param username [String] profile identifier as used in the profile URL
# @return [Array<String>] the album URLs that were processed
def scrape_photos(username)
  visit("/#{username}/photos_albums")

  album_urls = find_links_with('/media/set')
  album_urls.each { |album_url| scrape_album(album_url) }
end
123
+
124
# Sign in to Facebook with the credentials from the environment and work
# through any login-approval checkpoints.
#
# Reads FACEBOOK_EMAIL and FACEBOOK_PASSWORD from ENV. While the page shows
# the approval-code field, the code is prompted for on the terminal and
# submitted; afterwards any remaining checkpoint pages are confirmed.
#
# @raise [KeyError] if FACEBOOK_EMAIL or FACEBOOK_PASSWORD is not set
# @raise [EOFError] if standard input ends while a login code is awaited
# @return [void]
def login
  # Look the credentials up first so a missing setting fails before the
  # browser is touched, instead of submitting an empty login form.
  email = ENV.fetch('FACEBOOK_EMAIL')
  password = ENV.fetch('FACEBOOK_PASSWORD')

  visit('/login')
  fill_in('email', with: email)
  fill_in('pass', with: password)
  click_button('loginbutton')

  while has_css?('#approvals_code')
    print 'Enter your 6-digit login code: '
    # Kernel#gets reads from files named in ARGV when the script was started
    # with arguments, so read from standard input explicitly.
    code = $stdin.gets
    raise EOFError, 'standard input ended before a login code was entered' if code.nil?

    fill_in('approvals_code', with: code.chomp)
    click_button('checkpointSubmitButton')
  end

  while has_css?('#checkpointSubmitButton')
    click_button('checkpointSubmitButton')
  end
end
140
+ end
141
+ end
@@ -0,0 +1,3 @@
1
module FacebookProfileScraper
  # Gem version string, read by the gemspec. Frozen so the constant cannot be
  # mutated in place at runtime.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,8 @@
1
+ require 'facebook_profile_scraper'
2
+
3
# Placeholder spec for the scraper: the single example has no active
# expectations yet, so it passes without exercising anything.
RSpec.describe(FacebookProfileScraper::Scraper, js: true) do
  it('logs in') do
    # Intended check, currently disabled:
    # subject.login
    # expect(subject.page.find('#pagelet_welcome_box')).to be_true
  end
end
@@ -0,0 +1,110 @@
1
+ require 'capybara'
2
+ require 'capybara/poltergeist'
3
+
4
# Specs use the Poltergeist driver unless a driver is chosen explicitly.
Capybara.configure { |capybara| capybara.default_driver = :poltergeist }
7
+
8
+ # This file was generated by the `rspec --init` command. Conventionally, all
9
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
10
+ # The generated `.rspec` file contains `--require spec_helper` which will cause
11
+ # this file to always be loaded, without a need to explicitly require it in any
12
+ # files.
13
+ #
14
+ # Given that it is always loaded, you are encouraged to keep this file as
15
+ # light-weight as possible. Requiring heavyweight dependencies from this file
16
+ # will add to the boot time of your test suite on EVERY test run, even for an
17
+ # individual file that may not need all of that loaded. Instead, consider making
18
+ # a separate helper file that requires the additional dependencies and performs
19
+ # the additional setup, and require it from the spec files that actually need
20
+ # it.
21
+ #
22
+ # The `.rspec` file also contains a few flags that are not defaults but that
23
+ # users commonly want.
24
+ #
25
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
26
# Project-wide RSpec settings (based on the `rspec --init` template).
RSpec.configure do |config|
  # rspec-expectations: include `chain`ed helper text in the `description`
  # and `failure_message` of custom matchers, e.g.
  #   be_bigger_than(2).and_smaller_than(4).description
  #   # => "be bigger than 2 and smaller than 4"   (instead of "be bigger than 2")
  # This becomes the default in RSpec 4.
  config.expect_with(:rspec) do |expectations|
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  # rspec-mocks: refuse to mock or stub a method that does not exist on the
  # real object. This becomes the default in RSpec 4.
  config.mock_with(:rspec) do |mocks|
    mocks.verify_partial_doubles = true
  end

  # Shared context metadata is inherited by the metadata hash of host groups
  # and examples instead of triggering implicit auto-inclusion in groups with
  # matching metadata. This becomes the only behavior in RSpec 4.
  config.shared_context_metadata_behavior = :apply_to_host_groups

  # The remaining settings from the template are suggested defaults that are
  # currently disabled. Uncomment the ones you want.
  #
  # Limit a run to examples or groups tagged with `:focus` (also available
  # through `fit`, `fdescribe` and `fcontext`); without any `:focus` tag all
  # examples run.
  #   config.filter_run_when_matching :focus
  #
  # Persist example state between runs to support `--only-failures` and
  # `--next-failure`. Keep this file out of source control.
  #   config.example_status_persistence_file_path = "spec/examples.txt"
  #
  # Restrict the DSL to the recommended non-monkey-patched syntax. See:
  #   - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/
  #   - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
  #   - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode
  #   config.disable_monkey_patching!
  #
  # Enable Ruby warnings (may be noisy because of dependencies).
  #   config.warnings = true
  #
  # Use the documentation formatter when a single spec file is run, unless a
  # formatter was already configured (e.g. via a command-line flag).
  #   if config.files_to_run.one?
  #     config.default_formatter = 'doc'
  #   end
  #
  # Print the 10 slowest examples and example groups after the run.
  #   config.profile_examples = 10
  #
  # Run specs in random order to surface order dependencies; a failing order
  # can be reproduced with the seed printed after each run (`--seed 1234`).
  #   config.order = :random
  #
  # Seed global randomization from `--seed` so randomization-related failures
  # can be reproduced deterministically.
  #   Kernel.srand config.seed
end
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: facebook_profile_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Maros Hluska
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-08-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: selenium-webdriver
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.53'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.53'
27
+ - !ruby/object:Gem::Dependency
28
+ name: dotenv
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.1'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.1'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.6'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.6'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '11.2'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '11.2'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.5'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.5'
83
+ description: Scrape photos and other data from a friend's Facebook profile page
84
+ email:
85
+ - mhluska@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - ".gitignore"
91
+ - ".rspec"
92
+ - Gemfile
93
+ - LICENSE.txt
94
+ - README.md
95
+ - Rakefile
96
+ - facebook_profile_scraper.gemspec
97
+ - lib/facebook_profile_scraper.rb
98
+ - lib/facebook_profile_scraper/scraper.rb
99
+ - lib/facebook_profile_scraper/version.rb
100
+ - spec/scraper_spec.rb
101
+ - spec/spec_helper.rb
102
+ homepage: http://mhluska.com/
103
+ licenses:
104
+ - MIT
105
+ metadata: {}
106
+ post_install_message:
107
+ rdoc_options: []
108
+ require_paths:
109
+ - lib
110
+ required_ruby_version: !ruby/object:Gem::Requirement
111
+ requirements:
112
+ - - ">="
113
+ - !ruby/object:Gem::Version
114
+ version: '0'
115
+ required_rubygems_version: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: '0'
120
+ requirements: []
121
+ rubyforge_project:
122
+ rubygems_version: 2.6.6
123
+ signing_key:
124
+ specification_version: 4
125
+ summary: Scrape a friend's Facebook profile page
126
+ test_files:
127
+ - spec/scraper_spec.rb
128
+ - spec/spec_helper.rb