open_events 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/README +3 -0
- data/Rakefile +7 -0
- data/boston/booksmith.rb +39 -0
- data/boston/brattle.rb +34 -0
- data/lib/cached_html.rb +27 -0
- data/lib/event_scraper.rb +33 -0
- data/lib/runner.rb +29 -0
- data/notes.txt +9 -0
- data/open_events.gemspec +25 -0
- metadata +76 -0
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
html_cache/
|
data/README
ADDED
data/Rakefile
ADDED
data/boston/booksmith.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'event_scraper'
|
2
|
+
|
3
|
+
# Scraper for the Brookline Booksmith events page.
#
# Inherits page loading and the parse loop from EventScraper; this class only
# supplies the venue description, the node selection, and per-node extraction.
class Booksmith < EventScraper

  # Event links on the page point at the shop site's /event/ path. Anchored
  # with \A and with literal dots so only hrefs that start with exactly this
  # prefix qualify.
  EVENT_LINK = %r{\Ahttp://www\.brooklinebooksmith-shop\.com/event/}

  # Returns the static description of this listing: display title, page URL,
  # and the category/location tags.
  def about
    {
      title: 'Brookline Booksmith',
      url: 'http://www.brooklinebooksmith.com/events/mainevent.html',
      categories: %w(books speakers),
      locations: %w(coolidge-corner)
    }
  end

  # Returns the table rows (<tr>) that enclose an event link.
  #
  # Links with no <tr> ancestor are dropped so that #event never receives nil.
  def nodes
    doc.search('strong a').
      select {|a| a['href'] =~ EVENT_LINK }.
      map {|a| a.ancestors.detect {|ancestor| ancestor.name == 'tr' } }.
      compact
  end

  # Returns the raw inner HTML of an event row.
  #
  # n - a node produced by #nodes.
  #
  # TODO: structured extraction (date, time, title, link) is not implemented
  # for this venue yet; the selectors previously sketched here sat after an
  # unconditional return and were never executed.
  def event(n)
    n.inner_html
  end
end
|
39
|
+
|
data/boston/brattle.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'event_scraper'
|
2
|
+
|
3
|
+
# Scraper for the Brattle Theater "special events" calendar page.
class Brattle < EventScraper

  # Static description of this listing: display title, page URL, and the
  # category/location tags.
  def about
    {
      title: 'Brattle Theater Special Events',
      url: 'http://brattlefilm.org/category/calendar-2/special-events/',
      categories: %w(movies),
      locations: %w(harvard-square)
    }
  end

  # The direct <div> children of the #calendarframe element, one per entry.
  def nodes
    calendar = doc.at('#calendarframe')
    calendar.xpath('./div')
  end

  # Builds a hash with :date, :time, :title and :link for one calendar entry.
  #
  # n - a node produced by #nodes.
  def event(n)
    meta = n.at('.entry-meta')
    date =
      if meta
        meta.inner_text
      else
        # No date on this entry: reuse the date of the previously parsed
        # event, if there is one.
        previous = @res.last
        previous && previous[:date]
      end

    time_node = n.at('li/text()')
    time = time_node && time_node.text.strip

    link = n.at('.entry-title a')['href']
    title = n.at('.entry-title').inner_text

    { date: date, time: time, title: title, link: link }
  end
end
|
34
|
+
|
data/lib/cached_html.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
# Mixin that downloads a page and keeps a copy on disk, so later calls for
# the same URL are served from the local file instead of the network.
module CachedHtml

  # Directory, relative to the current working directory, holding the copies.
  CACHE_DIR = 'html_cache'

  # Create the cache directory when the module is loaded. Done in-process
  # rather than by shelling out to `mkdir -p`, so it works without a shell.
  Dir.mkdir(CACHE_DIR) unless File.directory?(CACHE_DIR)

  # Returns the cache file path used for +url+.
  def filename(url)
    File.join(CACHE_DIR, munge(url))
  end

  # Turns +url+ into a flat file name: drops a leading "http://", drops one
  # trailing non-word character, then replaces every remaining non-word
  # character (slashes and dots included) with "-".
  def munge(url)
    url.sub("http://", '').sub(/\W$/, '').gsub('/', '.').gsub(/\W/, '-')
  end

  # Returns the HTML for +url+, reading the cached copy when a non-empty one
  # exists and otherwise fetching the page and writing it to the cache.
  #
  # The fetch goes through URI#read (provided by open-uri) instead of
  # Kernel#open: Kernel#open no longer opens URLs on Ruby 3, would run a
  # command for a string starting with "|", and left the response handle
  # unclosed. Errors raised by the fetch propagate to the caller.
  def cached_html(url)
    path = filename(url)
    # File.size? is nil for a missing or zero-length file, so an empty cache
    # file is treated as a miss and refetched.
    return File.read(path) if File.size?(path)

    puts "Fetching #{url}"
    html = URI.parse(url).read
    File.open(path, 'w') {|f| f.write html}
    puts "Cached html to #{path}"
    html
  end
end
|
data/lib/event_scraper.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'cached_html'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
# Base class for venue scrapers.
#
# The methods here call #about (a hash with a :url entry), #nodes (the
# elements to visit) and #event(node) (extraction for one element); none of
# the three is defined in this class.
class EventScraper
  include CachedHtml
  attr_accessor :doc

  # Loads the venue page (through the cache) and parses it with Nokogiri.
  def initialize
    @doc = Nokogiri::HTML.parse(html)
  end

  # Raw HTML of the venue page, fetched at most once per instance.
  def html
    @html ||= cached_html(about[:url])
  end

  # Runs #event over every node and returns the collected results.
  #
  # Results accumulate in @res as the loop runs, so #event can look at the
  # entries parsed before the current one.
  def parse
    @res = []
    nodes.each do |node|
      @res << event(node)
    end
    @res
  end

  # Debug aid: prints each node's HTML, then its whitespace-collapsed text,
  # followed by an 80-character rule.
  def parse_test
    rule = '-' * 80
    nodes.each do |node|
      puts node.inner_html
      puts node.inner_text.gsub(/\s{2,}/, ' ').strip
      puts rule
    end
  end

end
|
33
|
+
|
data/lib/runner.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Put the relative directory 'scrapers' at the front of the load path, so the
# `require venue` call in Runner can resolve files from it.
# NOTE(review): the packaged scrapers appear to live under boston/, not
# scrapers/ - confirm this path is still intended.
$LOAD_PATH.unshift('scrapers')
|
2
|
+
|
3
|
+
# TODO: use optparse to output HTML fragments for interactive development
|
4
|
+
|
5
|
+
|
6
|
+
# Command-line driver: loads one venue scraper, instantiates it, and prints
# its description and parsed events.
class Runner
  # venue - name of the scraper file to require (e.g. "brattle"); the class
  #         to instantiate is derived from it with #camelize.
  def initialize(venue)
    require venue
    class_name = camelize(venue)
    puts class_name
    scraper = Object.const_get(class_name).new
    puts scraper.about.inspect
    puts scraper.parse
  end

  # from active support
  #
  # Converts "snake_case/path" into "SnakeCase::Path". When
  # first_letter_in_uppercase is false the first character is lower-cased
  # instead ("snakeCase").
  def camelize(lower_case_and_underscored_word, first_letter_in_uppercase = true)
    word = lower_case_and_underscored_word.to_s

    unless first_letter_in_uppercase
      return word[0].chr.downcase + camelize(lower_case_and_underscored_word)[1..-1]
    end

    # "/x" becomes "::X", then the first character and every "_x" are upcased.
    namespaced = word.gsub(/\/(.?)/) { "::#{Regexp.last_match(1).upcase}" }
    namespaced.gsub(/(?:^|_)(.)/) { Regexp.last_match(1).upcase }
  end
end
|
26
|
+
|
27
|
+
# When this file is executed directly, run the scraper named by the first
# command-line argument.
Runner.new(ARGV.first) if __FILE__ == $0
|
data/notes.txt
ADDED
data/open_events.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
# Append the lib/ directory that sits next to this gemspec to the load path.
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
#require "open_events/version"
|
4
|
+
|
5
|
+
# Gem specification for open_events. The file lists are taken from whatever
# git tracks at build time.
Gem::Specification.new do |spec|
  # Identity
  spec.name                  = 'open_events'
  spec.version               = '0.0.1'
  spec.platform              = Gem::Platform::RUBY
  spec.required_ruby_version = '>= 1.9.0'

  # Authorship and description
  spec.authors     = ['Daniel Choi']
  spec.email       = ['dhchoi@gmail.com']
  #spec.homepage    = "http://danielchoi.com/software/open_events.html"
  spec.summary     = 'Events listings web scrapers and tools'
  spec.description = 'An open-source repository of events listings web scrapers and tools'

  spec.rubyforge_project = 'open_events'

  # Packaged files
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Runtime dependencies
  spec.add_dependency 'nokogiri'
end
|
metadata
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: open_events
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.0.1
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Daniel Choi
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-04-18 00:00:00 -04:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: nokogiri
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ">="
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: "0"
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
description: An open-source repository of events listings web scrapers and tools
|
28
|
+
email:
|
29
|
+
- dhchoi@gmail.com
|
30
|
+
executables: []
|
31
|
+
|
32
|
+
extensions: []
|
33
|
+
|
34
|
+
extra_rdoc_files: []
|
35
|
+
|
36
|
+
files:
|
37
|
+
- .gitignore
|
38
|
+
- README
|
39
|
+
- Rakefile
|
40
|
+
- boston/booksmith.rb
|
41
|
+
- boston/brattle.rb
|
42
|
+
- lib/cached_html.rb
|
43
|
+
- lib/event_scraper.rb
|
44
|
+
- lib/runner.rb
|
45
|
+
- notes.txt
|
46
|
+
- open_events.gemspec
|
47
|
+
has_rdoc: true
|
48
|
+
homepage:
|
49
|
+
licenses: []
|
50
|
+
|
51
|
+
post_install_message:
|
52
|
+
rdoc_options: []
|
53
|
+
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.9.0
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: "0"
|
68
|
+
requirements: []
|
69
|
+
|
70
|
+
rubyforge_project: open_events
|
71
|
+
rubygems_version: 1.6.1
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: Events listings web scrapers and tools
|
75
|
+
test_files: []
|
76
|
+
|