open_events 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ html_cache/
data/README ADDED
@@ -0,0 +1,3 @@
1
+ # Open Events
2
+
3
+ Description coming
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require 'rake'
2
+ require 'rake/testtask'
3
+ require 'bundler'
4
+ Bundler::GemHelper.install_tasks
5
+
6
+ $LOAD_PATH.unshift File.join(File.dirname(__FILE__), 'lib')
7
+
@@ -0,0 +1,39 @@
1
+ require 'event_scraper'
2
+
3
# Scraper for the Brookline Booksmith events page.
#
# Provides the +about+, +nodes+ and +event+ hooks that EventScraper calls.
class Booksmith < EventScraper

  # Links that point into the Booksmith shop's per-event pages. The dots are
  # escaped and the pattern is anchored with \A so that only hrefs that
  # really start with this host/path prefix match.
  EVENT_LINK = %r{\Ahttp://www\.brooklinebooksmith-shop\.com/event/}

  # Static description of the venue and of the page that gets scraped.
  def about
    {
      title: 'Brookline Booksmith',
      url: 'http://www.brooklinebooksmith.com/events/mainevent.html',
      categories: %w(books speakers),
      locations: %w(coolidge-corner)
    }
  end

  # Returns the enclosing <tr> element of every event link found inside a
  # <strong> element. Links that have no <tr> ancestor are dropped instead of
  # handing +nil+ to #event.
  def nodes
    doc.search('strong a').
      select { |link| link['href'] =~ EVENT_LINK }.
      map { |link| link.ancestors.detect { |node| node.name == 'tr' } }.
      compact
  end

  # Returns the raw inner HTML of the table row +n+.
  #
  # TODO: extract date/time/title/link fields for this page. The code that
  # used to follow the early return here was unreachable and used the same
  # selectors as Brattle#event ('.entry-meta', '.entry-title'), so it has
  # been removed rather than left as dead code.
  def event(n)
    n.inner_html
  end
end
39
+
data/boston/brattle.rb ADDED
@@ -0,0 +1,34 @@
1
+ require 'event_scraper'
2
+
3
# Scraper for the Brattle Theater "Special Events" calendar page.
#
# Provides the +about+, +nodes+ and +event+ hooks that EventScraper calls.
class Brattle < EventScraper

  # Static description of the venue and of the page that gets scraped.
  def about
    {
      title: 'Brattle Theater Special Events',
      url: 'http://brattlefilm.org/category/calendar-2/special-events/',
      categories: %w(movies),
      locations: %w(harvard-square)
    }
  end

  # The direct <div> children of the element with id "calendarframe".
  def nodes
    calendar = doc.at('#calendarframe')
    calendar.xpath('./div')
  end

  # Builds a Hash with :date, :time, :title and :link for the node +n+.
  #
  # When the node has no '.entry-meta' element, the date is taken from the
  # last entry in @res (the results collected so far by EventScraper#parse),
  # or is nil when @res is empty.
  def event(n)
    meta = n.at('.entry-meta')
    date = meta ? meta.inner_text : (@res.last && @res.last[:date])

    time_node = n.at('li/text()')
    time = time_node && time_node.text.strip

    link = n.at('.entry-title a')['href']
    title = n.at('.entry-title').inner_text

    { date: date, time: time, title: title, link: link }
  end
end
34
+
@@ -0,0 +1,27 @@
1
require 'open-uri'
require 'fileutils'

# Mixin that downloads a page once and afterwards serves it from an on-disk
# cache in CACHE_DIR (a path relative to the current working directory).
module CachedHtml

  CACHE_DIR = 'html_cache'
  # Make sure the cache directory exists as soon as the module is loaded.
  # FileUtils is used instead of shelling out to `mkdir -p`.
  FileUtils.mkdir_p(CACHE_DIR)

  # Path of the cache file that belongs to +url+.
  def filename(url)
    File.join(CACHE_DIR, munge(url))
  end

  # Flattens +url+ into a file name: the "http://" prefix and one trailing
  # non-word character are removed, then every remaining non-word character
  # (slashes and dots included) becomes a dash, e.g.
  # "http://www.example.com/a/b.html" => "www-example-com-a-b-html".
  def munge(url)
    url.sub("http://", '').sub(/\W$/, '').gsub('/', '.').gsub(/\W/, '-')
  end

  # Returns the HTML for +url+, reading it from the cache file when that file
  # exists and is non-empty, and otherwise fetching it and writing the cache.
  #
  # The fetch goes through URI#read (provided by open-uri) rather than
  # Kernel#open: Kernel#open no longer opens URLs on Ruby 3.0+, it treats a
  # leading "|" as a command to run, and the handle it returned here was
  # never closed.
  def cached_html(url)
    path = filename(url)
    return File.read(path) if File.size?(path)

    puts "Fetching #{url}"
    html = URI.parse(url).read
    File.open(path, 'w') { |f| f.write html }
    puts "Cached html to #{path}"
    html
  end
end
@@ -0,0 +1,33 @@
1
+ require 'cached_html'
2
+ require 'nokogiri'
3
+
4
# Base class for venue scrapers.
#
# This class calls three methods that it does not define itself, so a
# subclass has to supply them:
#
#   about     - Hash whose :url entry is the page to fetch
#   nodes     - collection of document nodes, one per event
#   event(n)  - converts one such node into a result
class EventScraper
  include CachedHtml
  attr_accessor :doc

  # Fetches (or reads from cache) the page and parses it with Nokogiri.
  def initialize
    @doc = Nokogiri::HTML.parse(html)
  end

  # Raw HTML of the page named by about[:url], memoized per instance.
  def html
    @html ||= cached_html(about[:url])
  end

  # Runs #event over every node and returns the collected results.
  #
  # Results are appended to @res one at a time, so that an #event
  # implementation can look at the previous result via @res.last while the
  # scan is still running. `each` is used because the block is run only for
  # that side effect.
  def parse
    @res = []
    nodes.each { |n| @res << event(n) }
    @res
  end

  # Debugging aid: prints each node's HTML, then its text with runs of
  # whitespace collapsed to one space, followed by a separator line.
  def parse_test
    nodes.each do |n|
      puts n.inner_html
      puts n.inner_text.gsub(/\s{2,}/, ' ').strip
      puts '-' * 80
    end
  end

end
33
+
data/lib/runner.rb ADDED
@@ -0,0 +1,29 @@
1
+ $LOAD_PATH.unshift('scrapers')
2
+
3
+ # TODO: use optparse to output HTML fragments for interactive development
4
+
5
+
6
# Loads one venue scraper by file name, runs it and prints what it found.
class Runner
  # Requires the file named +venue+, instantiates the class whose name is the
  # camelized form of +venue+, and prints the class name, the scraper's
  # +about+ Hash and the result of its +parse+.
  def initialize(venue)
    require venue
    class_name = camelize(venue)
    puts class_name
    scraper = Object.const_get(class_name).new
    puts scraper.about.inspect
    events = scraper.parse
    puts events
  end

  # from active support
  #
  # Converts "foo_bar" to "FooBar" and "foo/bar" to "Foo::Bar". With
  # +first_letter_in_uppercase+ set to false the first character of the
  # result is lower-cased instead ("foo_bar" => "fooBar").
  def camelize(lower_case_and_underscored_word, first_letter_in_uppercase = true)
    word = lower_case_and_underscored_word.to_s
    unless first_letter_in_uppercase
      return word[0].chr.downcase + camelize(lower_case_and_underscored_word)[1..-1]
    end

    namespaced = word.gsub(/\/(.?)/) { "::#{Regexp.last_match(1).upcase}" }
    namespaced.gsub(/(?:^|_)(.)/) { Regexp.last_match(1).upcase }
  end
end
26
+
27
# Command-line entry point: ruby runner.rb VENUE
if __FILE__ == $0
  venue = ARGV.first
  # Without an argument Runner.new(nil) would fail inside `require` with a
  # TypeError; print a usage line and exit non-zero instead.
  abort "Usage: ruby #{$0} VENUE" unless venue
  Runner.new(venue)
end
data/notes.txt ADDED
@@ -0,0 +1,9 @@
1
+
2
+ A curated simple event scraper for select Cambridge, MA places.
3
+
4
+ Plus Sinatra app.
5
+
6
+ Develop most of the scrapers on the command line, then develop the Sinatra
7
+ app.
8
+
9
+
@@ -0,0 +1,25 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ #require "open_events/version"
4
+
5
# Packaging metadata for the open_events gem.
Gem::Specification.new do |s|
  s.name        = "open_events"
  s.version     = "0.0.1"
  s.platform    = Gem::Platform::RUBY
  s.required_ruby_version = '>= 1.9.0'

  s.authors     = ["Daniel Choi"]
  s.email       = ["dhchoi@gmail.com"]
  #s.homepage    = "http://danielchoi.com/software/open_events.html"
  s.summary     = %q{Events listings web scrapers and tools}
  s.description = %q{An open-source repository of events listings web scrapers and tools}

  # s.rubyforge_project is no longer set: the attribute is deprecated in
  # RubyGems and has no replacement.

  # The packaged file lists come from git, so only tracked files are shipped.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency 'nokogiri'
end
metadata ADDED
@@ -0,0 +1,76 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: open_events
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: ruby
7
+ authors:
8
+ - Daniel Choi
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-04-18 00:00:00 -04:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: nokogiri
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: "0"
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ description: An open-source repository of events listings web scrapers and tools
28
+ email:
29
+ - dhchoi@gmail.com
30
+ executables: []
31
+
32
+ extensions: []
33
+
34
+ extra_rdoc_files: []
35
+
36
+ files:
37
+ - .gitignore
38
+ - README
39
+ - Rakefile
40
+ - boston/booksmith.rb
41
+ - boston/brattle.rb
42
+ - lib/cached_html.rb
43
+ - lib/event_scraper.rb
44
+ - lib/runner.rb
45
+ - notes.txt
46
+ - open_events.gemspec
47
+ has_rdoc: true
48
+ homepage:
49
+ licenses: []
50
+
51
+ post_install_message:
52
+ rdoc_options: []
53
+
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: 1.9.0
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: "0"
68
+ requirements: []
69
+
70
+ rubyforge_project: open_events
71
+ rubygems_version: 1.6.1
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: Events listings web scrapers and tools
75
+ test_files: []
76
+