gumtree_scraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 42e2924d09e2b7f023235c3a1c0a824bb4af9f4b
4
+ data.tar.gz: aa76be8fe4d7974a0149ae08eceb92ec52ecebef
5
+ SHA512:
6
+ metadata.gz: 4f07747c6cb026f0028f802198cd2916a9c5ab1c1c6ff4364bc064a198aa6df7b71e8d3a21afa42f356e0714d672098428615f469207735900a9db02e3cf2fc0
7
+ data.tar.gz: 0fba809d91d3afbdfe2decc0dce1efa450ad9f019c5f7a3f4e618c588e9dc85ff0d722d84e343b4a67d6a5afb2cb6df076c1561c919fc25549c66d9b67092a59
data/.gitignore ADDED
@@ -0,0 +1,34 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /test/tmp/
9
+ /test/version_tmp/
10
+ /tmp/
11
+
12
+ ## Specific to RubyMotion:
13
+ .dat*
14
+ .repl_history
15
+ build/
16
+
17
+ ## Documentation cache and generated files:
18
+ /.yardoc/
19
+ /_yardoc/
20
+ /doc/
21
+ /rdoc/
22
+
23
+ ## Environment normalisation:
24
+ /.bundle/
25
+ /lib/bundler/man/
26
+
27
+ # for a library or gem, you might want to ignore these files since the code is
28
+ # intended to run in multiple environments; otherwise, check them in:
29
+ # Gemfile.lock
30
+ # .ruby-version
31
+ # .ruby-gemset
32
+
33
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
34
+ .rvmrc
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,34 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ gumtree_scraper (0.0.1)
5
+ bundler (~> 1.5)
6
+ docparser (~> 0.2)
7
+ thor (~> 0.19)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ axlsx (2.0.1)
13
+ htmlentities (~> 4.3.1)
14
+ nokogiri (>= 1.4.1)
15
+ rubyzip (~> 1.0.0)
16
+ docparser (0.2.3)
17
+ axlsx (~> 2.0.1)
18
+ log4r (~> 1.1.10)
19
+ nokogiri (~> 1.6.1)
20
+ parallel (~> 1.3.2)
21
+ htmlentities (4.3.2)
22
+ log4r (1.1.10)
23
+ mini_portile (0.6.1)
24
+ nokogiri (1.6.4)
25
+ mini_portile (~> 0.6.0)
26
+ parallel (1.3.3)
27
+ rubyzip (1.0.0)
28
+ thor (0.19.1)
29
+
30
+ PLATFORMS
31
+ ruby
32
+
33
+ DEPENDENCIES
34
+ gumtree_scraper!
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 charles marshall
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
data/README.md ADDED
@@ -0,0 +1,20 @@
1
+ gumtree_scrape
2
+ ==============
3
+
4
+ Install with:
5
+
6
+ `gem install gumtree_scraper`
7
+
8
+ Usage:
9
+
10
+ On terminal run:
11
+
12
+ `gumtree_scrape go`
13
+
14
+ If requests are failing because your IP has been blocked, there is a built-in proxy. To use it, run:
15
+
16
+ `gumtree_scrape go --proxy`
17
+
18
+ At the moment only a single proxy is set up: http://anonymouse.org
19
+
20
+ When running, it will ask a series of questions; just enter one of the values offered.
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
4
+ $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
5
+
6
+ require "gum"
7
+ GUM::Gum.start(ARGV)
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "gum/version"
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "gumtree_scraper"
8
+ spec.version = GUM::VERSION
9
+ spec.authors = ["Charles Marshall"]
10
+ spec.email = ["cm56marshall@gmail.com"]
11
+ spec.summary = %q{ }
12
+ spec.description = %q{ }
13
+ spec.homepage = "https://github.com/charlesmarshall/gumtree_scraper/tree/#{GUM::VERSION}"
14
+ spec.license = "MIT"
15
+ spec.files = `git ls-files -z`.split("\x0")
16
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
+ spec.require_paths = ["lib"]
19
+ spec.required_ruby_version = '>= 2.0'
20
+
21
+ spec.add_dependency "docparser", "~> 0.2"
22
+ spec.add_dependency "bundler", "~> 1.5"
23
+ spec.add_dependency "thor", "~> 0.19"
24
+
25
+ spec.post_install_message = ""
26
+
27
+ end
data/lib/gum/base.rb ADDED
@@ -0,0 +1,33 @@
1
+ module GUM
2
+ class Base < Thor
3
+ include Thor::Shell
4
+
5
+ DEFAULT_PROXIES = [
6
+ "http://anonymouse.org/cgi-bin/anon-www.cgi/%URL",
7
+ ]
8
+
9
+ no_commands{
10
+ def log(out)
11
+ say("[#{Time.now.to_s}] #{out}", :black)
12
+ end
13
+
14
+ # fetch url and save to a local file for parsing
15
+ def url_to_file(url, proxi=nil)
16
+ url = URI.escape(url)
17
+ file = "/tmp/gum_#{Time.now.to_i.to_s}"
18
+ if proxi
19
+ proxy = GUM::Base::DEFAULT_PROXIES.sample
20
+ log("Using proxy: #{proxy}")
21
+ url = proxy.gsub("%URL", url)
22
+ File.open(file, "wb") { |f| f.write open(url).read.gsub(proxy.gsub("%URL", ""), "") }
23
+ else
24
+ File.open(file, "wb") { |f| f.write open(url).read }
25
+ end
26
+ log("'#{url}' saved to '#{file}'")
27
+ return file
28
+ end
29
+
30
+ }
31
+
32
+ end
33
+ end
data/lib/gum/gum.rb ADDED
@@ -0,0 +1,50 @@
1
+ module GUM
2
+ class Gum < Base
3
+ class_option :verbose, :type => :boolean, :default => true
4
+ class_option :proxy, :type => :boolean, :default => true
5
+
6
+
7
+ desc "go", "Run everything"
8
+ def go
9
+
10
+ # first off, ask what we want to download
11
+ page = GUM::Pages.new(options)
12
+ # base url without the page number subbed out
13
+ url = page.url(1)
14
+ # now check if the page works
15
+ file = self.url_to_file(url, options["proxy"])
16
+ if File.exists?(file)
17
+ max_page = page.max_page(file)
18
+ start_page = ask("Enter page to start at: ", :yellow)
19
+ start_page = 1 if start_page.nil?
20
+ asked_pages = ask("Enter page to end at (found #{max_page}) : ", :yellow)
21
+ max_page = asked_pages.to_i if ! asked_pages.nil? && asked_pages != "y"
22
+ log("Using #{start_page}-#{max_page} as max page")
23
+
24
+ adverts = page.advert_links(start_page.to_i ,max_page)
25
+ say("Found #{adverts.length} adverts", :green)
26
+ # now fetch data for each link
27
+ processed = page.advert_data(adverts, :telephone)
28
+ say("Found #{processed.length} with phone numbers", :green)
29
+ # output to a csv
30
+ path = Dir.pwd+"/gumtree_p#{start_page}_p#{max_page}_#{Time.now.to_i.to_s}.csv"
31
+
32
+ CSV.open(path, "wb") do |csvfile|
33
+ processed.each do|v|
34
+ log("Saving #{v[:url]}")
35
+ csvfile << v.values if ! v.nil?
36
+ end
37
+ end
38
+ # clean up
39
+ say("Saved to : #{path}", :green)
40
+ `rm /tmp/gum_* 2>&1 /dev/null`
41
+ say("Complete", :blue)
42
+ else
43
+ say("Failed to download page", :red)
44
+ exit 1
45
+ end
46
+ end
47
+
48
+
49
+ end
50
+ end
data/lib/gum/pages.rb ADDED
@@ -0,0 +1,156 @@
1
+ module GUM
2
+ class Pages < Base
3
+ include DocParser
4
+ attr_accessor :top_level, :sub_level_values, :base_uri, :uri, :proxy
5
+ def initialize(options)
6
+ @proxy = nil
7
+ options.each do |k,v|
8
+ self.class.send(:attr_accessor, k)
9
+ self.instance_variable_set("@#{k}",v)
10
+ end
11
+ end
12
+
13
+ # gum tree page info
14
+ DOMAIN = "http://www.gumtree.com"
15
+ BASE_SEARCH = "#{GUM::Pages::DOMAIN}/search?q=&search_category=%CATEGORY&search_location=%LOCATION&distance=%DISTANCE&seller_type=%SELLER_TYPE&page=%PAGE"
16
+ LISTING_PAGES ={
17
+ "property" => {
18
+ "category" => "flatshare",
19
+ "location" => "birmingham",
20
+ "distance" => ["10", "50", "100", "500", "1000"],
21
+ "seller_type" => ["private", "trade"],
22
+ }
23
+ }
24
+
25
+ # page data
26
+ SELECTORS = {
27
+ "property" => {
28
+ "list" => {
29
+ :body => "#fullListings",
30
+ :max_page => "li.page-last a",
31
+ # anything with a link in the key name is presumed that you want the href value, not the content
32
+ :item_links => "a.listing-link"
33
+ },
34
+ "advert" => {
35
+ :body => ".main main",
36
+ :name => "h1.space-mbs",
37
+ :price => ".ad-price",
38
+ :telephone => ".truncate-number",
39
+ :description => "p.ad-description"
40
+ }
41
+ }
42
+ }
43
+
44
+ no_commands {
45
+ # select the top level versions
46
+ def top_level_selection
47
+ ask("Please select what type of search page you want to scrape:", :yellow, :limited_to =>GUM::Pages::LISTING_PAGES.keys)
48
+ end
49
+
50
+ # find the real values for each item in side the sub list
51
+ def get_values(section)
52
+ data = GUM::Pages::LISTING_PAGES[section]
53
+ values = {}
54
+ data.each do |k, options|
55
+ if options.class == Array
56
+ values[k] = ask("Enter value for #{k}: ", :yellow, :limited_to => options)
57
+ else
58
+ values[k] = options
59
+ end
60
+ log("selected '#{values[k]}'")
61
+ end
62
+ values
63
+ end
64
+
65
+ # use the base url and swap out values
66
+ def parse(values)
67
+ base = GUM::Pages::BASE_SEARCH
68
+ values.each{|k,v| base = base.gsub("%#{k.upcase}", v) if ! v.nil?}
69
+ base
70
+ end
71
+ # grab the base url with the %page param still set
72
+ def base_url()
73
+ @top_level = self.top_level_selection
74
+ log("selected '#{@top_level}'")
75
+ # request all the data underneath top level
76
+ @sub_level_values = self.get_values(@top_level).merge({"page"=>nil})
77
+ # now sub out values and get the url
78
+ @base_uri = self.parse(@sub_level_values)
79
+ @base_uri
80
+ end
81
+
82
+ # get the url of a page
83
+ def url(page)
84
+ @base_uri = self.base_url if @base_uri.nil?
85
+ @uri = @base_uri.gsub("%PAGE", page.to_s)
86
+ log("url '#{@uri}'")
87
+ @uri
88
+ end
89
+
90
+ # load the page up
91
+ def load_without_output(file)
92
+ @parser = Parser.new(files: [file], parallel: false, quiet: true)
93
+ self
94
+ end
95
+ # find values for attributes
96
+ def values(type)
97
+ body = GUM::Pages::SELECTORS[@top_level][type][:body]
98
+ selectors = GUM::Pages::SELECTORS[@top_level][type].select{ |k,v| k.to_s != "body" }
99
+ data = {}
100
+ @parser.parse! do
101
+ css(body) do |post|
102
+ selectors.each do |n,selector|
103
+ data[n] = []
104
+ post.search(selector).each{ |i| data[n].push(if n.to_s.index("link") then i.attributes["href"].value.gsub(GUM::Pages::DOMAIN, "") else i.content.strip end) }
105
+ end
106
+ end
107
+ end
108
+ # remove dups
109
+ data.each{|k,v| data[k] = v.first if k.to_s.index("link").nil? && v.class == Array }
110
+ data
111
+ end
112
+
113
+ # find max value
114
+ def max_page(file)
115
+ # find the max page
116
+ list_values = self.load_without_output(file).values("list")
117
+ max_page = list_values[:max_page].to_i
118
+ log("max page: '#{max_page}'")
119
+ max_page
120
+ end
121
+
122
+ # find all advert links
123
+ def advert_links(start, max_page)
124
+ adverts = []
125
+ range = (start..max_page).to_a
126
+ range.each do |i|
127
+ say("Page #{i}", :green)
128
+ url = self.url(i)
129
+ file = self.url_to_file(url, @proxy) if ! url.nil?
130
+ list_values = self.load_without_output(file).values("list") if ! file.nil?
131
+ log("found #{list_values[:item_links].length} adverts")
132
+ # merge together
133
+ adverts = adverts.concat(list_values[:item_links])
134
+ log("total length so far: #{adverts.length}")
135
+ end
136
+ adverts
137
+ end
138
+
139
+ # find the data for each advert
140
+ def advert_data(adverts, with)
141
+ data = []
142
+ adverts.each_with_index do |post, i|
143
+ url = GUM::Pages::DOMAIN + post
144
+ say("Ad #{i}: #{url}", :green)
145
+ file = self.url_to_file(url, @proxy)
146
+ values = self.load_without_output(file).values("advert").merge({url:url}) if ! file.nil?
147
+ data.push(values) if ! values[with].nil? && values[with].length > 0
148
+ sleep 0.2
149
+ end
150
+ data
151
+ end
152
+ }
153
+
154
+ end
155
+
156
+ end
@@ -0,0 +1,3 @@
1
+ module GUM
2
+ VERSION = "0.0.1"
3
+ end
data/lib/gum.rb ADDED
@@ -0,0 +1,13 @@
1
+ require "gum/version"
2
+ require "thor"
3
+ require "docparser"
4
+ require "open-uri"
5
+ require "uri"
6
+ require "csv"
7
+
8
+ module GUM
9
+ require "gum/base"
10
+ require "gum/pages"
11
+
12
+ require "gum/gum"
13
+ end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gumtree_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Charles Marshall
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-11-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: docparser
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.2'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.2'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.5'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.5'
41
+ - !ruby/object:Gem::Dependency
42
+ name: thor
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.19'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.19'
55
+ description: " "
56
+ email:
57
+ - cm56marshall@gmail.com
58
+ executables:
59
+ - gumtree_scrape
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - ".gitignore"
64
+ - Gemfile
65
+ - Gemfile.lock
66
+ - LICENSE
67
+ - README.md
68
+ - bin/gumtree_scrape
69
+ - gumtree_scrape.gemspec
70
+ - lib/gum.rb
71
+ - lib/gum/base.rb
72
+ - lib/gum/gum.rb
73
+ - lib/gum/pages.rb
74
+ - lib/gum/version.rb
75
+ homepage: https://github.com/charlesmarshall/gumtree_scraper/tree/0.0.1
76
+ licenses:
77
+ - MIT
78
+ metadata: {}
79
+ post_install_message: ''
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '2.0'
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ requirements: []
94
+ rubyforge_project:
95
+ rubygems_version: 2.2.2
96
+ signing_key:
97
+ specification_version: 4
98
+ summary: ''
99
+ test_files: []