open_events 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/README +3 -0
- data/Rakefile +7 -0
- data/boston/booksmith.rb +39 -0
- data/boston/brattle.rb +34 -0
- data/lib/cached_html.rb +27 -0
- data/lib/event_scraper.rb +33 -0
- data/lib/runner.rb +29 -0
- data/notes.txt +9 -0
- data/open_events.gemspec +25 -0
- metadata +76 -0
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
html_cache/
|
data/README
ADDED
data/Rakefile
ADDED
data/boston/booksmith.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'event_scraper'
|
2
|
+
|
3
|
+
# Scraper for the Brookline Booksmith events page.
class Booksmith < EventScraper

  # Links to individual event pages on the Booksmith shop site. Anchored with
  # \A and with the dots escaped so that only this exact host prefix matches.
  EVENT_LINK = %r{\Ahttp://www\.brooklinebooksmith-shop\.com/event/}

  # Static description of this source: display title, the page URL that is
  # fetched, and category/location tags.
  def about
    {
      title: 'Brookline Booksmith',
      url: 'http://www.brooklinebooksmith.com/events/mainevent.html',
      categories: %w(books speakers),
      locations: %w(coolidge-corner)
    }
  end

  # Returns the <tr> elements that enclose a `strong a` link whose href points
  # at an event page (one entry per matching link).
  def nodes
    doc.search('strong a').
      select {|x| x['href'] =~ EVENT_LINK }.
      map {|x| x.ancestors.detect {|y| y.name == 'tr'} }.
      compact # a link outside any <tr> yields nil; drop it rather than pass nil to #event
  end

  # Returns the raw inner HTML of row +n+.
  #
  # TODO: extract date/time/title/link as Brattle#event does. Brattle's
  # selectors ('.entry-meta', '.entry-title') presumably do not apply to
  # these table rows -- confirm against the live page before reusing them.
  def event(n)
    n.inner_html
  end
end
|
39
|
+
|
data/boston/brattle.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'event_scraper'
|
2
|
+
|
3
|
+
# Scraper for the Brattle Theater special-events calendar.
class Brattle < EventScraper

  # Static description of this source: display title, the page URL that is
  # fetched, and category/location tags.
  def about
    {
      title: 'Brattle Theater Special Events',
      url: 'http://brattlefilm.org/category/calendar-2/special-events/',
      categories: %w(movies),
      locations: %w(harvard-square)
    }
  end

  # Returns the <div> elements that are direct children of #calendarframe,
  # or an empty list when the page has no such element (instead of raising
  # NoMethodError on nil).
  def nodes
    frame = doc.at('#calendarframe')
    frame ? frame.xpath('./div') : []
  end

  # Builds an event hash (:date, :time, :title, :link) from calendar div +n+.
  #
  # A div without its own '.entry-meta' inherits the date of the previously
  # parsed event. Any field whose element is missing is nil, so one
  # incomplete div does not abort the whole parse.
  def event(n)
    meta       = n.at('.entry-meta')
    previous   = @res && @res.last # @res is only set while EventScraper#parse runs
    time_node  = n.at('li/text()')
    title_node = n.at('.entry-title')
    link_node  = n.at('.entry-title a')
    {
      date: meta ? meta.inner_text : (previous && previous[:date]),
      time: time_node && time_node.text.strip,
      title: title_node && title_node.inner_text,
      link: link_node && link_node['href']
    }
  end
end
|
34
|
+
|
data/lib/cached_html.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
# Mixin that downloads a page once and afterwards serves it from a file in
# CACHE_DIR.
module CachedHtml

  # Cache directory, relative to the current working directory.
  CACHE_DIR = 'html_cache'
  # Created when this file is loaded so #cached_html can write straight away.
  # Done in Ruby rather than by shelling out to `mkdir -p`.
  Dir.mkdir(CACHE_DIR) unless File.directory?(CACHE_DIR)

  # Path of the cache file used for +url+.
  def filename(url)
    File.join(CACHE_DIR, munge(url))
  end

  # Turns +url+ into a flat file name: drops the first "http://", drops one
  # trailing non-word character (e.g. a final "/"), and replaces every
  # remaining non-word character (including "/" and ".") with "-".
  def munge(url)
    url.sub("http://", '').sub(/\W$/, '').gsub(/\W/, '-')
  end

  # Returns the HTML for +url+. A non-empty cache file is read and returned;
  # otherwise the page is fetched, written to the cache file and returned.
  # Progress messages are printed to stdout on a fetch.
  #
  # The fetch goes through URI#read (open-uri) instead of Kernel#open:
  # Kernel#open no longer opens URLs on Ruby 3, would execute a "|command"
  # string, and left the returned handle unclosed.
  def cached_html(url)
    path = filename(url)
    return File.read(path) if File.size?(path)

    puts "Fetching #{url}"
    html = URI.parse(url).read
    File.open(path, 'w') {|f| f.write html}
    puts "Cached html to #{path}"
    html
  end
end
|
data/lib/event_scraper.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'cached_html'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
# Base class for venue scrapers. A subclass supplies:
#   about    - a hash that includes the page :url
#   nodes    - the document nodes that each describe one event
#   event(n) - the value recorded for a single node
class EventScraper
  include CachedHtml
  attr_accessor :doc

  # Loads the page HTML (via #html) and parses it into a Nokogiri document.
  def initialize
    @doc = Nokogiri::HTML.parse(html)
  end

  # Page source for about[:url], memoized after the first call.
  def html
    @html ||= cached_html(about[:url])
  end

  # Runs #event over every node in order and returns the collected results.
  # Results accumulate in @res one at a time, so #event can consult the
  # entries gathered so far.
  def parse
    @res = []
    nodes.each do |node|
      @res << event(node)
    end
    @res
  end

  # Development aid: prints each node's inner HTML, then its text with runs
  # of whitespace collapsed, then a separator line.
  def parse_test
    nodes.each do |node|
      puts node.inner_html
      puts node.inner_text.gsub(/\s{2,}/, ' ').strip
      puts '-' * 80
    end
  end

end
|
33
|
+
|
data/lib/runner.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
$LOAD_PATH.unshift('scrapers')
|
2
|
+
|
3
|
+
# TODO use optparse to output HTML fragments for iterative development
|
4
|
+
|
5
|
+
|
6
|
+
# Loads one venue scraper by name, runs it and prints what it finds.
class Runner
  # Requires the file named +venue+, instantiates the class whose name is the
  # camelized venue, then prints the class name, the scraper's #about hash
  # and the result of its #parse.
  #
  # Raises ArgumentError when +venue+ is nil or empty (previously this
  # surfaced as a TypeError from `require nil`).
  def initialize(venue)
    raise ArgumentError, 'venue name required' if venue.to_s.empty?

    require venue
    klass = camelize(venue)
    puts klass
    parser = Object.const_get(klass).new
    puts parser.about.inspect
    res = parser.parse
    puts res
  end

  # from active support
  #
  # Converts "event_scraper" to "EventScraper" and "a/b" to "A::B". With
  # +first_letter_in_uppercase+ false the first character is lower-cased
  # instead ("eventScraper"); an empty word is returned as an empty string.
  def camelize(lower_case_and_underscored_word, first_letter_in_uppercase = true)
    word = lower_case_and_underscored_word.to_s
    if first_letter_in_uppercase
      word.gsub(/\/(.?)/) { "::#{Regexp.last_match(1).upcase}" }.
        gsub(/(?:^|_)(.)/) { Regexp.last_match(1).upcase }
    else
      # Guard: an empty word has no first character to lower-case.
      return word if word.empty?
      word[0, 1].downcase + camelize(word)[1..-1]
    end
  end
end
|
26
|
+
|
27
|
+
# Command-line entry point: scrape the venue named by the first argument.
if __FILE__ == $0
  # Runner needs a venue name; print usage instead of a stack trace.
  abort "Usage: ruby #{__FILE__} VENUE" if ARGV.empty?
  Runner.new(ARGV.first)
end
|
data/notes.txt
ADDED
data/open_events.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
#require "open_events/version"
|
4
|
+
|
5
|
+
# Gem specification for open_events. File lists are taken from git, so the
# gem must be built from inside a git checkout.
Gem::Specification.new do |s|
  s.name        = "open_events"
  s.version     = "0.0.1"
  s.platform    = Gem::Platform::RUBY
  s.required_ruby_version = '>= 1.9.0'

  s.authors     = ["Daniel Choi"]
  s.email       = ["dhchoi@gmail.com"]
  #s.homepage    = "http://danielchoi.com/software/open_events.html"
  s.summary     = %q{Events listings web scrapers and tools}
  s.description = %q{An open-source repository of events listings web scrapers and tools}

  # No rubyforge_project: the attribute is deprecated in RubyGems and only
  # produces a warning at build time.

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency 'nokogiri'
end
|
metadata
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: open_events
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.0.1
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Daniel Choi
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-04-18 00:00:00 -04:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: nokogiri
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ">="
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: "0"
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
description: An open-source repository of events listings web scrapers and tools
|
28
|
+
email:
|
29
|
+
- dhchoi@gmail.com
|
30
|
+
executables: []
|
31
|
+
|
32
|
+
extensions: []
|
33
|
+
|
34
|
+
extra_rdoc_files: []
|
35
|
+
|
36
|
+
files:
|
37
|
+
- .gitignore
|
38
|
+
- README
|
39
|
+
- Rakefile
|
40
|
+
- boston/booksmith.rb
|
41
|
+
- boston/brattle.rb
|
42
|
+
- lib/cached_html.rb
|
43
|
+
- lib/event_scraper.rb
|
44
|
+
- lib/runner.rb
|
45
|
+
- notes.txt
|
46
|
+
- open_events.gemspec
|
47
|
+
has_rdoc: true
|
48
|
+
homepage:
|
49
|
+
licenses: []
|
50
|
+
|
51
|
+
post_install_message:
|
52
|
+
rdoc_options: []
|
53
|
+
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.9.0
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: "0"
|
68
|
+
requirements: []
|
69
|
+
|
70
|
+
rubyforge_project: open_events
|
71
|
+
rubygems_version: 1.6.1
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: Events listings web scrapers and tools
|
75
|
+
test_files: []
|
76
|
+
|