gumtree_scraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 42e2924d09e2b7f023235c3a1c0a824bb4af9f4b
4
+ data.tar.gz: aa76be8fe4d7974a0149ae08eceb92ec52ecebef
5
+ SHA512:
6
+ metadata.gz: 4f07747c6cb026f0028f802198cd2916a9c5ab1c1c6ff4364bc064a198aa6df7b71e8d3a21afa42f356e0714d672098428615f469207735900a9db02e3cf2fc0
7
+ data.tar.gz: 0fba809d91d3afbdfe2decc0dce1efa450ad9f019c5f7a3f4e618c588e9dc85ff0d722d84e343b4a67d6a5afb2cb6df076c1561c919fc25549c66d9b67092a59
data/.gitignore ADDED
@@ -0,0 +1,34 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /test/tmp/
9
+ /test/version_tmp/
10
+ /tmp/
11
+
12
+ ## Specific to RubyMotion:
13
+ .dat*
14
+ .repl_history
15
+ build/
16
+
17
+ ## Documentation cache and generated files:
18
+ /.yardoc/
19
+ /_yardoc/
20
+ /doc/
21
+ /rdoc/
22
+
23
+ ## Environment normalisation:
24
+ /.bundle/
25
+ /lib/bundler/man/
26
+
27
+ # for a library or gem, you might want to ignore these files since the code is
28
+ # intended to run in multiple environments; otherwise, check them in:
29
+ # Gemfile.lock
30
+ # .ruby-version
31
+ # .ruby-gemset
32
+
33
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
34
+ .rvmrc
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,34 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ gumtree_scraper (0.0.1)
5
+ bundler (~> 1.5)
6
+ docparser (~> 0.2)
7
+ thor (~> 0.19)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ axlsx (2.0.1)
13
+ htmlentities (~> 4.3.1)
14
+ nokogiri (>= 1.4.1)
15
+ rubyzip (~> 1.0.0)
16
+ docparser (0.2.3)
17
+ axlsx (~> 2.0.1)
18
+ log4r (~> 1.1.10)
19
+ nokogiri (~> 1.6.1)
20
+ parallel (~> 1.3.2)
21
+ htmlentities (4.3.2)
22
+ log4r (1.1.10)
23
+ mini_portile (0.6.1)
24
+ nokogiri (1.6.4)
25
+ mini_portile (~> 0.6.0)
26
+ parallel (1.3.3)
27
+ rubyzip (1.0.0)
28
+ thor (0.19.1)
29
+
30
+ PLATFORMS
31
+ ruby
32
+
33
+ DEPENDENCIES
34
+ gumtree_scraper!
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 charles marshall
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
data/README.md ADDED
@@ -0,0 +1,20 @@
1
+ gumtree_scrape
2
+ ==============
3
+
4
+ Install with:
5
+
6
+ `gem install gumtree_scraper`
7
+
8
+ Usage:
9
+
10
+ On terminal run:
11
+
12
+ `gumtree_scrape go`
13
+
14
+ If requests fail because your IP has been blocked, there is a built-in proxy. To use it, run:
15
+
16
+ `gumtree_scrape go --proxy`
17
+
18
+ At the moment only a single proxy is set up: http://anonymouse.org
19
+
20
+ When run, it will ask a series of questions; just enter one of the values offered.
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
4
+ $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
5
+
6
+ require "gum"
7
+ GUM::Gum.start(ARGV)
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "gum/version"
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "gumtree_scraper"
8
+ spec.version = GUM::VERSION
9
+ spec.authors = ["Charles Marshall"]
10
+ spec.email = ["cm56marshall@gmail.com"]
11
+ spec.summary = %q{ }
12
+ spec.description = %q{ }
13
+ spec.homepage = "https://github.com/charlesmarshall/gumtree_scraper/tree/#{GUM::VERSION}"
14
+ spec.license = "MIT"
15
+ spec.files = `git ls-files -z`.split("\x0")
16
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
+ spec.require_paths = ["lib"]
19
+ spec.required_ruby_version = '>= 2.0'
20
+
21
+ spec.add_dependency "docparser", "~> 0.2"
22
+ spec.add_dependency "bundler", "~> 1.5"
23
+ spec.add_dependency "thor", "~> 0.19"
24
+
25
+ spec.post_install_message = ""
26
+
27
+ end
data/lib/gum/base.rb ADDED
@@ -0,0 +1,33 @@
1
+ module GUM
2
+ class Base < Thor
3
+ include Thor::Shell
4
+
5
+ DEFAULT_PROXIES = [
6
+ "http://anonymouse.org/cgi-bin/anon-www.cgi/%URL",
7
+ ]
8
+
9
+ no_commands{
10
+ def log(out)
11
+ say("[#{Time.now.to_s}] #{out}", :black)
12
+ end
13
+
14
+ # fetch url and save to a local file for parsing
15
+ def url_to_file(url, proxi=nil)
16
+ url = URI.escape(url)
17
+ file = "/tmp/gum_#{Time.now.to_i.to_s}"
18
+ if proxi
19
+ proxy = GUM::Base::DEFAULT_PROXIES.sample
20
+ log("Using proxy: #{proxy}")
21
+ url = proxy.gsub("%URL", url)
22
+ File.open(file, "wb") { |f| f.write open(url).read.gsub(proxy.gsub("%URL", ""), "") }
23
+ else
24
+ File.open(file, "wb") { |f| f.write open(url).read }
25
+ end
26
+ log("'#{url}' saved to '#{file}'")
27
+ return file
28
+ end
29
+
30
+ }
31
+
32
+ end
33
+ end
data/lib/gum/gum.rb ADDED
@@ -0,0 +1,50 @@
1
+ module GUM
2
+ class Gum < Base
3
+ class_option :verbose, :type => :boolean, :default => true
4
+ class_option :proxy, :type => :boolean, :default => true
5
+
6
+
7
+ desc "go", "Run everything"
8
+ def go
9
+
10
+ # first off, ask what we want to download
11
+ page = GUM::Pages.new(options)
12
+ # base url without the page number subbed out
13
+ url = page.url(1)
14
+ # now check if the page works
15
+ file = self.url_to_file(url, options["proxy"])
16
+ if File.exists?(file)
17
+ max_page = page.max_page(file)
18
+ start_page = ask("Enter page to start at: ", :yellow)
19
+ start_page = 1 if start_page.nil?
20
+ asked_pages = ask("Enter page to end at (found #{max_page}) : ", :yellow)
21
+ max_page = asked_pages.to_i if ! asked_pages.nil? && asked_pages != "y"
22
+ log("Using #{start_page}-#{max_page} as max page")
23
+
24
+ adverts = page.advert_links(start_page.to_i ,max_page)
25
+ say("Found #{adverts.length} adverts", :green)
26
+ # now fetch data for each link
27
+ processed = page.advert_data(adverts, :telephone)
28
+ say("Found #{processed.length} with phone numbers", :green)
29
+ # output to a csv
30
+ path = Dir.pwd+"/gumtree_p#{start_page}_p#{max_page}_#{Time.now.to_i.to_s}.csv"
31
+
32
+ CSV.open(path, "wb") do |csvfile|
33
+ processed.each do|v|
34
+ log("Saving #{v[:url]}")
35
+ csvfile << v.values if ! v.nil?
36
+ end
37
+ end
38
+ # clean up
39
+ say("Saved to : #{path}", :green)
40
+ `rm /tmp/gum_* 2>&1 /dev/null`
41
+ say("Complete", :blue)
42
+ else
43
+ say("Failed to download page", :red)
44
+ exit 1
45
+ end
46
+ end
47
+
48
+
49
+ end
50
+ end
data/lib/gum/pages.rb ADDED
@@ -0,0 +1,156 @@
1
+ module GUM
2
+ class Pages < Base
3
+ include DocParser
4
+ attr_accessor :top_level, :sub_level_values, :base_uri, :uri, :proxy
5
+ def initialize(options)
6
+ @proxy = nil
7
+ options.each do |k,v|
8
+ self.class.send(:attr_accessor, k)
9
+ self.instance_variable_set("@#{k}",v)
10
+ end
11
+ end
12
+
13
+ # gum tree page info
14
+ DOMAIN = "http://www.gumtree.com"
15
+ BASE_SEARCH = "#{GUM::Pages::DOMAIN}/search?q=&search_category=%CATEGORY&search_location=%LOCATION&distance=%DISTANCE&seller_type=%SELLER_TYPE&page=%PAGE"
16
+ LISTING_PAGES ={
17
+ "property" => {
18
+ "category" => "flatshare",
19
+ "location" => "birmingham",
20
+ "distance" => ["10", "50", "100", "500", "1000"],
21
+ "seller_type" => ["private", "trade"],
22
+ }
23
+ }
24
+
25
+ # CSS selectors per search type: "list" describes a results page, "advert" a single advert page; :body is the container the other selectors are searched within
26
+ SELECTORS = {
27
+ "property" => {
28
+ "list" => {
29
+ :body => "#fullListings",
30
+ :max_page => "li.page-last a",
31
+ # any key whose name contains "link" is presumed to want the href value, not the content
32
+ :item_links => "a.listing-link"
33
+ },
34
+ "advert" => {
35
+ :body => ".main main",
36
+ :name => "h1.space-mbs",
37
+ :price => ".ad-price",
38
+ :telephone => ".truncate-number",
39
+ :description => "p.ad-description"
40
+ }
41
+ }
42
+ }
43
+
44
+ no_commands {
45
+ # select the top level versions
46
+ def top_level_selection
47
+ ask("Please select what type of search page you want to scrape:", :yellow, :limited_to =>GUM::Pages::LISTING_PAGES.keys)
48
+ end
49
+
50
+ # resolve every option listed under LISTING_PAGES[section]: array-valued options are put to the user via ask (limited to the listed choices), fixed values are used as-is; returns a hash of option name => chosen value
51
+ def get_values(section)
52
+ data = GUM::Pages::LISTING_PAGES[section]
53
+ values = {}
54
+ data.each do |k, options|
55
+ if options.class == Array
56
+ values[k] = ask("Enter value for #{k}: ", :yellow, :limited_to => options)
57
+ else
58
+ values[k] = options
59
+ end
60
+ log("selected '#{values[k]}'")
61
+ end
62
+ values
63
+ end
64
+
65
+ # use the base url and swap out values
66
+ def parse(values)
67
+ base = GUM::Pages::BASE_SEARCH
68
+ values.each{|k,v| base = base.gsub("%#{k.upcase}", v) if ! v.nil?}
69
+ base
70
+ end
71
+ # grab the base url with the %page param still set
72
+ def base_url()
73
+ @top_level = self.top_level_selection
74
+ log("selected '#{@top_level}'")
75
+ # request all the data underneath top level
76
+ @sub_level_values = self.get_values(@top_level).merge({"page"=>nil})
77
+ # now sub out values and get the url
78
+ @base_uri = self.parse(@sub_level_values)
79
+ @base_uri
80
+ end
81
+
82
+ # get the url of a page
83
+ def url(page)
84
+ @base_uri = self.base_url if @base_uri.nil?
85
+ @uri = @base_uri.gsub("%PAGE", page.to_s)
86
+ log("url '#{@uri}'")
87
+ @uri
88
+ end
89
+
90
+ # load the page up
91
+ def load_without_output(file)
92
+ @parser = Parser.new(files: [file], parallel: false, quiet: true)
93
+ self
94
+ end
95
+ # run the parser set up by load_without_output and collect, for every SELECTORS[@top_level][type] entry except :body, the matches found within the :body element; keys containing "link" yield href values with DOMAIN stripped, all other keys yield stripped text content
96
+ def values(type)
97
+ body = GUM::Pages::SELECTORS[@top_level][type][:body]
98
+ selectors = GUM::Pages::SELECTORS[@top_level][type].select{ |k,v| k.to_s != "body" }
99
+ data = {}
100
+ @parser.parse! do
101
+ css(body) do |post|
102
+ selectors.each do |n,selector|
103
+ data[n] = []
104
+ post.search(selector).each{ |i| data[n].push(if n.to_s.index("link") then i.attributes["href"].value.gsub(GUM::Pages::DOMAIN, "") else i.content.strip end) }
105
+ end
106
+ end
107
+ end
108
+ # collapse non-link results to their first match (nil when nothing matched); link keys keep the full array
109
+ data.each{|k,v| data[k] = v.first if k.to_s.index("link").nil? && v.class == Array }
110
+ data
111
+ end
112
+
113
+ # find max value
114
+ def max_page(file)
115
+ # find the max page
116
+ list_values = self.load_without_output(file).values("list")
117
+ max_page = list_values[:max_page].to_i
118
+ log("max page: '#{max_page}'")
119
+ max_page
120
+ end
121
+
122
+ # download the list pages start..max_page one by one and return the concatenation of the :item_links found on them (hrefs with DOMAIN removed)
123
+ def advert_links(start, max_page)
124
+ adverts = []
125
+ range = (start..max_page).to_a
126
+ range.each do |i|
127
+ say("Page #{i}", :green)
128
+ url = self.url(i)
129
+ file = self.url_to_file(url, @proxy) if ! url.nil?
130
+ list_values = self.load_without_output(file).values("list") if ! file.nil?
131
+ log("found #{list_values[:item_links].length} adverts")
132
+ # append this page's links to the running list
133
+ adverts = adverts.concat(list_values[:item_links])
134
+ log("total length so far: #{adverts.length}")
135
+ end
136
+ adverts
137
+ end
138
+
139
+ # download each advert path in adverts and return an array of its values("advert") hash merged with :url, keeping only adverts whose +with+ field (e.g. :telephone) is present and non-empty; sleeps 0.2s between requests
140
+ def advert_data(adverts, with)
141
+ data = []
142
+ adverts.each_with_index do |post, i|
143
+ url = GUM::Pages::DOMAIN + post
144
+ say("Ad #{i}: #{url}", :green)
145
+ file = self.url_to_file(url, @proxy)
146
+ values = self.load_without_output(file).values("advert").merge({url:url}) if ! file.nil?
147
+ data.push(values) if ! values[with].nil? && values[with].length > 0
148
+ sleep 0.2
149
+ end
150
+ data
151
+ end
152
+ }
153
+
154
+ end
155
+
156
+ end
@@ -0,0 +1,3 @@
1
+ module GUM
2
+ VERSION = "0.0.1"
3
+ end
data/lib/gum.rb ADDED
@@ -0,0 +1,13 @@
1
+ require "gum/version"
2
+ require "thor"
3
+ require "docparser"
4
+ require "open-uri"
5
+ require "uri"
6
+ require "csv"
7
+
8
+ module GUM
9
+ require "gum/base"
10
+ require "gum/pages"
11
+
12
+ require "gum/gum"
13
+ end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gumtree_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Charles Marshall
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-11-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: docparser
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.2'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.2'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.5'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.5'
41
+ - !ruby/object:Gem::Dependency
42
+ name: thor
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.19'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.19'
55
+ description: " "
56
+ email:
57
+ - cm56marshall@gmail.com
58
+ executables:
59
+ - gumtree_scrape
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - ".gitignore"
64
+ - Gemfile
65
+ - Gemfile.lock
66
+ - LICENSE
67
+ - README.md
68
+ - bin/gumtree_scrape
69
+ - gumtree_scrape.gemspec
70
+ - lib/gum.rb
71
+ - lib/gum/base.rb
72
+ - lib/gum/gum.rb
73
+ - lib/gum/pages.rb
74
+ - lib/gum/version.rb
75
+ homepage: https://github.com/charlesmarshall/gumtree_scraper/tree/0.0.1
76
+ licenses:
77
+ - MIT
78
+ metadata: {}
79
+ post_install_message: ''
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '2.0'
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ requirements: []
94
+ rubyforge_project:
95
+ rubygems_version: 2.2.2
96
+ signing_key:
97
+ specification_version: 4
98
+ summary: ''
99
+ test_files: []