craigslister 2.0.0 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/craigslister/craigslister.rb +52 -0
- data/lib/craigslister/post.rb +13 -0
- data/lib/craigslister/post_scraper.rb +49 -0
- data/lib/craigslister/scraper.rb +39 -0
- data/lib/craigslister.rb +4 -105
- metadata +8 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 83f273dd97db5b9e9322851b3b69a5818194c39e
|
4
|
+
data.tar.gz: a53505d6d77de5be42ff950c88d9379289fb3e8a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 813c4dc83d6f84642167220e7f8f25a5bff5aef84bd491b12a8065de2ff20a49f4645743169853645ced314de9a73595d8788857ea43aa1e5977acd81582521e
|
7
|
+
data.tar.gz: 7cac96de0d2b7852b90d9d57ba4a045ab5641e3f36b2c4a264c8a9f26c7722a5af1c34e151f7ca1f82be2eee85aec91b9c1db1b8230ccaac7b12f827eb8b5b4e
|
data/lib/craigslister/craigslister.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# Thrown when low price is higher than high price
|
2
|
+
class InvalidRangeError < StandardError
|
3
|
+
end
|
4
|
+
|
5
|
+
# Creates url from arguments and scrapes
|
6
|
+
class Craigslister
|
7
|
+
attr_reader :area, :item, :high, :low
|
8
|
+
|
9
|
+
def initialize(args)
|
10
|
+
@area = args.fetch(:area, 'sfbay')
|
11
|
+
@item = args[:item]
|
12
|
+
@high = args.fetch(:high, nil)
|
13
|
+
@low = args.fetch(:low, nil)
|
14
|
+
validate_price_range
|
15
|
+
end
|
16
|
+
|
17
|
+
def scrape
|
18
|
+
scraper.scrape
|
19
|
+
end
|
20
|
+
|
21
|
+
def links
|
22
|
+
scraper.links
|
23
|
+
end
|
24
|
+
|
25
|
+
def url
|
26
|
+
"#{base_url}/search/sss?sort=rel&"\
|
27
|
+
"#{price_query}query="\
|
28
|
+
"#{item.downcase.split(' ') * '+'}"
|
29
|
+
end
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
def scraper
|
34
|
+
Scraper.new(url, base_url)
|
35
|
+
end
|
36
|
+
|
37
|
+
def base_url
|
38
|
+
"https://#{area}.craigslist.org"
|
39
|
+
end
|
40
|
+
|
41
|
+
def price_query
|
42
|
+
result = ''
|
43
|
+
result += "min_price=#{low}&" if low
|
44
|
+
result += "max_price=#{high}&" if high
|
45
|
+
result
|
46
|
+
end
|
47
|
+
|
48
|
+
def validate_price_range
|
49
|
+
return unless low && high && low > high
|
50
|
+
fail(InvalidRangeError, 'Price range is invalid.')
|
51
|
+
end
|
52
|
+
end
|
data/lib/craigslister/post.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Used for packaging Craigslist post data
|
2
|
+
class Post
|
3
|
+
attr_reader :title, :image, :price, :location, :url, :description
|
4
|
+
|
5
|
+
def initialize(args)
|
6
|
+
@title = args[:title]
|
7
|
+
@image = args[:image]
|
8
|
+
@price = args[:price]
|
9
|
+
@location = args[:location]
|
10
|
+
@description = args[:description]
|
11
|
+
@url = args[:url]
|
12
|
+
end
|
13
|
+
end
|
data/lib/craigslister/post_scraper.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# Creates Post objects out of an HTML page
|
2
|
+
class PostScraper
|
3
|
+
def initialize(page, link)
|
4
|
+
@page = page
|
5
|
+
@link = link
|
6
|
+
end
|
7
|
+
|
8
|
+
def new_post
|
9
|
+
Post.new(
|
10
|
+
image: image,
|
11
|
+
title: title,
|
12
|
+
price: price,
|
13
|
+
location: location,
|
14
|
+
description: description,
|
15
|
+
url: link
|
16
|
+
)
|
17
|
+
end
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
attr_reader :page, :link
|
22
|
+
|
23
|
+
def posting_title
|
24
|
+
page.at('span.postingtitletext')
|
25
|
+
end
|
26
|
+
|
27
|
+
def image
|
28
|
+
image = page.at('img')
|
29
|
+
image ? image['src'] : ''
|
30
|
+
end
|
31
|
+
|
32
|
+
def title
|
33
|
+
posting_title.text.gsub(/ ?- ?\$\d+ ?\(.+\)/, '')
|
34
|
+
end
|
35
|
+
|
36
|
+
def price
|
37
|
+
price = posting_title.at('span.price')
|
38
|
+
price ? price.text.gsub(/\$/, '').to_i : 0
|
39
|
+
end
|
40
|
+
|
41
|
+
def location
|
42
|
+
location = posting_title.at('small')
|
43
|
+
location ? location.text.gsub(/ ?[\(\)]/, '') : ''
|
44
|
+
end
|
45
|
+
|
46
|
+
def description
|
47
|
+
page.at('section#postingbody').text
|
48
|
+
end
|
49
|
+
end
|
data/lib/craigslister/scraper.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# Houses all higher level scraping logic
|
2
|
+
class Scraper
|
3
|
+
def initialize(url, base_url)
|
4
|
+
@url = url
|
5
|
+
@base_url = base_url
|
6
|
+
end
|
7
|
+
|
8
|
+
def links
|
9
|
+
header_link.map { |link| format_link(link['href']) }
|
10
|
+
end
|
11
|
+
|
12
|
+
def scrape
|
13
|
+
links.flat_map { |link| post_from(link) }
|
14
|
+
end
|
15
|
+
|
16
|
+
private
|
17
|
+
|
18
|
+
attr_reader :url, :base_url
|
19
|
+
|
20
|
+
def page_from(url)
|
21
|
+
Nokogiri::HTML(open(url))
|
22
|
+
end
|
23
|
+
|
24
|
+
def post_from(link)
|
25
|
+
PostScraper.new(page_from(link), link).new_post
|
26
|
+
end
|
27
|
+
|
28
|
+
def header_link
|
29
|
+
page_from(url).css('.hdrlnk')
|
30
|
+
end
|
31
|
+
|
32
|
+
def format_link(link)
|
33
|
+
if link =~ /\w+\.craig/
|
34
|
+
'https:' + link
|
35
|
+
else
|
36
|
+
base_url + link
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/lib/craigslister.rb
CHANGED
@@ -1,107 +1,6 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
require 'open-uri'
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
class Craigslister
|
10
|
-
attr_reader :area, :item, :high, :low
|
11
|
-
|
12
|
-
def initialize args
|
13
|
-
@area = args.fetch(:area, 'sfbay')
|
14
|
-
@item = args[:item]
|
15
|
-
@high = args.fetch(:high, nil)
|
16
|
-
@low = args.fetch(:low, nil)
|
17
|
-
validate_price_range
|
18
|
-
end
|
19
|
-
|
20
|
-
def scrape!
|
21
|
-
links.map {|link| item_from(link)}.compact
|
22
|
-
end
|
23
|
-
|
24
|
-
def links
|
25
|
-
page_from(url).css('.hdrlnk').map {|link| format_link(link)}
|
26
|
-
end
|
27
|
-
|
28
|
-
def url
|
29
|
-
"#{base_url}/search/sss?sort=rel&"\
|
30
|
-
"#{price_query}query="\
|
31
|
-
"#{item.downcase.split(' ') * '+'}"
|
32
|
-
end
|
33
|
-
|
34
|
-
|
35
|
-
private
|
36
|
-
def base_url
|
37
|
-
"https://#{area}.craigslist.org"
|
38
|
-
end
|
39
|
-
|
40
|
-
def page_from url
|
41
|
-
Nokogiri::HTML(open(url))
|
42
|
-
end
|
43
|
-
|
44
|
-
def format_link link
|
45
|
-
link['href'] =~ /\w+\.craig/ ? "https:" + link['href'] : base_url + link['href']
|
46
|
-
end
|
47
|
-
|
48
|
-
def price_query
|
49
|
-
result = ''
|
50
|
-
result += "min_price=#{low}&" if low
|
51
|
-
result += "max_price=#{high}&" if high
|
52
|
-
result
|
53
|
-
end
|
54
|
-
|
55
|
-
def validate_price_range
|
56
|
-
raise InvalidRangeError if low && high && low > high
|
57
|
-
end
|
58
|
-
|
59
|
-
def item_from link
|
60
|
-
Item.new(get_item_data(page_from(link), link))
|
61
|
-
end
|
62
|
-
|
63
|
-
def get_item_data page, link
|
64
|
-
{
|
65
|
-
image: scrape_image(page),
|
66
|
-
title: page.at('span.postingtitletext').text.gsub(/ ?- ?\$\d+ ?\(.+\)/, ''),
|
67
|
-
price: scrape_price(page),
|
68
|
-
location: scrape_location(page),
|
69
|
-
description: page.at('section#postingbody').text,
|
70
|
-
url: link
|
71
|
-
}
|
72
|
-
end
|
73
|
-
|
74
|
-
def scrape_image page
|
75
|
-
page.at('img') ? page.at('img')['src'] : ""
|
76
|
-
end
|
77
|
-
|
78
|
-
def scrape_price page
|
79
|
-
if price = page.at('span.postingtitletext span.price')
|
80
|
-
price.text.gsub(/\$/,'').to_i
|
81
|
-
else
|
82
|
-
0
|
83
|
-
end
|
84
|
-
end
|
85
|
-
|
86
|
-
def scrape_location page
|
87
|
-
if location = page.at('span.postingtitletext small')
|
88
|
-
location.text.gsub(/ ?[\(\)]/,'')
|
89
|
-
else
|
90
|
-
""
|
91
|
-
end
|
92
|
-
end
|
93
|
-
end
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
class Item
|
98
|
-
attr_reader :title, :image, :price, :location, :url
|
99
|
-
|
100
|
-
def initialize args
|
101
|
-
@title = args[:title]
|
102
|
-
@image = args[:image]
|
103
|
-
@price = args[:price]
|
104
|
-
@location = args[:location]
|
105
|
-
@url = args[:url]
|
106
|
-
end
|
107
|
-
end
|
3
|
+
require 'craigslister/scraper'
|
4
|
+
require 'craigslister/post_scraper'
|
5
|
+
require 'craigslister/post'
|
6
|
+
require 'craigslister/craigslister'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: craigslister
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.0
|
4
|
+
version: 2.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Scott
|
@@ -30,13 +30,17 @@ dependencies:
|
|
30
30
|
- - ">="
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 1.6.6.2
|
33
|
-
description: all you need is an item title and you can scrape
|
33
|
+
description: all you need is an item title and you can scrape posts from craigslist
|
34
34
|
email: christo247@gmail.com
|
35
35
|
executables: []
|
36
36
|
extensions: []
|
37
37
|
extra_rdoc_files: []
|
38
38
|
files:
|
39
39
|
- lib/craigslister.rb
|
40
|
+
- lib/craigslister/craigslister.rb
|
41
|
+
- lib/craigslister/post.rb
|
42
|
+
- lib/craigslister/post_scraper.rb
|
43
|
+
- lib/craigslister/scraper.rb
|
40
44
|
homepage: https://github.com/Yago580/craigslister
|
41
45
|
licenses:
|
42
46
|
- MIT
|
@@ -57,8 +61,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
57
61
|
version: '0'
|
58
62
|
requirements: []
|
59
63
|
rubyforge_project:
|
60
|
-
rubygems_version: 2.4.5
|
64
|
+
rubygems_version: 2.4.5.1
|
61
65
|
signing_key:
|
62
66
|
specification_version: 4
|
63
|
-
summary: Scrape Craigslist for
|
67
|
+
summary: Scrape Craigslist for Posts
|
64
68
|
test_files: []
|