open_events 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ html_cache/
data/README ADDED
@@ -0,0 +1,3 @@
1
+ # Open Events
2
+
3
+ Description coming
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require 'rake'
2
+ require 'rake/testtask'
3
+ require 'bundler'
4
+ Bundler::GemHelper.install_tasks
5
+
6
+ $LOAD_PATH.unshift File.join(File.dirname(__FILE__), 'lib')
7
+
@@ -0,0 +1,39 @@
1
+ require 'event_scraper'
2
+
3
# Scraper for the Brookline Booksmith events page.
#
# Supplies the three hooks EventScraper drives: venue metadata (+about+),
# the page nodes that hold events (+nodes+) and the per-node extraction
# (+event+).
class Booksmith < EventScraper

  # Prefix of links that point at an individual event page. Anchored with \A
  # and with the dots escaped so only this exact scheme and host match.
  EVENT_LINK = %r{\Ahttp://www\.brooklinebooksmith-shop\.com/event/}

  # Venue metadata: display title, listing URL, and category/location tags.
  def about
    {
      title: 'Brookline Booksmith',
      url: 'http://www.brooklinebooksmith.com/events/mainevent.html',
      categories: %w(books speakers),
      locations: %w(coolidge-corner)
    }
  end

  # Table rows (<tr>) that contain a bolded link to an event page.
  #
  # Anchors without an href never match, and links that are not inside a
  # table row are dropped, so callers never receive nil in place of a node.
  def nodes
    doc.search('strong a').
      select { |a| a['href'].to_s =~ EVENT_LINK }.
      map { |a| a.ancestors.find { |e| e.name == 'tr' } }.
      compact
  end

  # Returns the raw inner HTML of one event row.
  #
  # TODO: return a hash of date/time/title/link (the shape Brattle#event
  # produces) once the row markup has been mapped to selectors. The earlier
  # draft sat after an unconditional return and could never run, so it has
  # been removed rather than left as dead code.
  def event(n)
    n.inner_html
  end
end
39
+
data/boston/brattle.rb ADDED
@@ -0,0 +1,34 @@
1
+ require 'event_scraper'
2
+
3
# Scraper for the Brattle Theater special-events calendar.
#
# Supplies the three hooks EventScraper drives: venue metadata (+about+),
# the page nodes that hold events (+nodes+) and the per-node extraction
# (+event+).
class Brattle < EventScraper

  # Venue metadata: display title, listing URL, and category/location tags.
  def about
    {
      title: 'Brattle Theater Special Events',
      url: 'http://brattlefilm.org/category/calendar-2/special-events/',
      categories: %w(movies),
      locations: %w(harvard-square)
    }
  end

  # Direct <div> children of the #calendarframe element.
  #
  # Returns an empty list instead of raising when the page has no
  # #calendarframe element.
  def nodes
    frame = doc.at('#calendarframe')
    frame ? frame.xpath('./div') : []
  end

  # Builds the event hash for one calendar <div>.
  #
  # n - the node to read.
  #
  # Returns a Hash with :date, :time, :title and :link. Any element that is
  # missing from the node yields nil for that key rather than an exception.
  # When the node carries no .entry-meta of its own, the date of the most
  # recently parsed event (the last entry of @res) is reused.
  def event(n)
    meta     = n.at('.entry-meta')
    slot     = n.at('li/text()')
    heading  = n.at('.entry-title')
    anchor   = n.at('.entry-title a')
    # @res is only set while EventScraper#parse is running; tolerate direct calls.
    previous = @res && @res.last

    {
      date: meta ? meta.inner_text : (previous && previous[:date]),
      time: slot && slot.text.strip,
      title: heading && heading.inner_text,
      link: anchor && anchor['href']
    }
  end
end
34
+
@@ -0,0 +1,27 @@
1
+ require 'open-uri'
2
# Mixin that downloads a page once and serves it from an on-disk cache on
# every later request for the same URL.
module CachedHtml

  # Directory, relative to the current working directory, that holds the
  # cached pages.
  CACHE_DIR = 'html_cache'

  # Create the cache directory when the module is loaded. Done in Ruby rather
  # than by shelling out to `mkdir -p`, so no external shell is needed.
  Dir.mkdir(CACHE_DIR) unless File.directory?(CACHE_DIR)

  # Path of the cache file used for +url+.
  def filename(url)
    File.join(CACHE_DIR, munge(url))
  end

  # Turns +url+ into a flat file name: drops a leading "http://", drops one
  # trailing non-word character, and replaces every remaining non-word
  # character (including "/" and ".") with "-".
  def munge(url)
    url.sub("http://", '').sub(/\W$/, '').gsub(/\W/, '-')
  end

  # Returns the HTML for +url+, reading the cache file when it exists and is
  # non-empty, and otherwise fetching the page and writing it to the cache.
  #
  # The fetch goes through OpenURI::OpenRead#read on the parsed URI instead of
  # Kernel#open: Kernel#open stopped opening URLs in Ruby 3.0, would run a
  # command for a string starting with "|", and left the returned IO open.
  # +url+ must therefore be an http(s)/ftp URL.
  def cached_html(url)
    path = filename(url)
    return File.read(path) if File.size?(path)

    puts "Fetching #{url}"
    html = URI.parse(url).read
    # The directory may have been removed, or the cwd changed, since load time.
    Dir.mkdir(CACHE_DIR) unless File.directory?(CACHE_DIR)
    File.open(path, 'w') { |f| f.write html }
    puts "Cached html to #{path}"
    html
  end
end
@@ -0,0 +1,33 @@
1
+ require 'cached_html'
2
+ require 'nokogiri'
3
+
4
# Base class for venue scrapers.
#
# A subclass supplies three hooks:
#   about    - Hash of venue metadata; :url is the page to scrape.
#   nodes    - the parsed-document nodes, one per event.
#   event(n) - the extracted event for a single node.
#
# The page is fetched through CachedHtml and parsed with Nokogiri when the
# scraper is instantiated.
class EventScraper
  include CachedHtml
  attr_accessor :doc

  # Fetches (or reads from cache) the venue page and parses it into +doc+.
  def initialize
    @doc = Nokogiri::HTML.parse(html)
  end

  # Raw HTML of the venue page, memoized for the life of the instance.
  def html
    @html ||= cached_html(about[:url])
  end

  # Runs +event+ over every node and returns the collected results.
  #
  # Results are appended to @res one at a time, not built with a single map,
  # so that an +event+ implementation can look at the entries parsed so far
  # (for example to carry a date forward from the previous event).
  def parse
    @res = []
    nodes.each { |n| @res << event(n) }
    @res
  end

  # Development aid: prints each node's HTML, its whitespace-collapsed text,
  # and a separator line.
  def parse_test
    nodes.each do |n|
      puts n.inner_html
      puts n.inner_text.gsub(/\s{2,}/, ' ').strip
      puts '-' * 80
    end
  end

end
33
+
data/lib/runner.rb ADDED
@@ -0,0 +1,29 @@
1
+ $LOAD_PATH.unshift('scrapers')
2
+
3
+ # TODO: use OptionParser to output HTML fragments for interactive development
4
+
5
+
6
# Loads a venue scraper by name, runs it, and prints what it found.
class Runner
  # Requires the scraper file for +venue+, instantiates the matching class,
  # and prints the class name, its +about+ metadata and its parsed events.
  #
  # venue - name of the scraper file to require, e.g. "brattle". A path such
  #         as "boston/brattle" or "/abs/path/brattle.rb" is also accepted;
  #         the class name is derived from the file's base name because the
  #         scraper classes are defined at the top level.
  #
  # Raises ArgumentError when +venue+ is nil or empty.
  def initialize(venue)
    name = venue.to_s
    raise ArgumentError, 'venue name required' if name.empty?

    require name
    klass = camelize(File.basename(name, '.rb'))
    puts klass
    parser = Object.const_get(klass).new
    puts parser.about.inspect
    res = parser.parse
    puts res
  end

  # Converts an underscored name to CamelCase (adapted from ActiveSupport).
  #
  # lower_case_and_underscored_word - the name to convert; "/" becomes "::".
  # first_letter_in_uppercase       - when false, the first letter is
  #                                   lower-cased (lowerCamelCase).
  #
  # Returns the converted String; an empty input yields an empty String.
  def camelize(lower_case_and_underscored_word, first_letter_in_uppercase = true)
    word = lower_case_and_underscored_word.to_s
    return word if word.empty?

    if first_letter_in_uppercase
      word.
        gsub(/\/(.?)/) { "::#{Regexp.last_match(1).upcase}" }.
        gsub(/(?:^|_)(.)/) { Regexp.last_match(1).upcase }
    else
      word[0].chr.downcase + camelize(word)[1..-1]
    end
  end
end
26
+
27
# Command-line entry point: scrape the venue named by the first argument.
# Prints a usage line and exits non-zero when no venue is given.
if __FILE__ == $0
  abort "Usage: ruby #{File.basename($0)} VENUE" if ARGV.empty?
  Runner.new(ARGV.first)
end
data/notes.txt ADDED
@@ -0,0 +1,9 @@
1
+
2
+ A curated simple event scraper for select Cambridge, MA places.
3
+
4
+ Plus Sinatra app.
5
+
6
+ Develop most of the scrapers on the command line, then develop the Sinatra
7
+ app.
8
+
9
+
@@ -0,0 +1,25 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ #require "open_events/version"
4
+
5
# Gem specification for open_events. The file lists are taken from git, so
# the gem must be built from inside a git checkout.
Gem::Specification.new do |s|
  s.name        = "open_events"
  s.version     = "0.0.1"
  s.platform    = Gem::Platform::RUBY
  s.required_ruby_version = '>= 1.9.0'

  s.authors     = ["Daniel Choi"]
  s.email       = ["dhchoi@gmail.com"]
  #s.homepage    = "http://danielchoi.com/software/open_events.html"
  s.summary     = %q{Events listings web scrapers and tools}
  s.description = %q{An open-source repository of events listings web scrapers and tools}

  # rubyforge_project is intentionally not set: RubyForge has shut down and
  # RubyGems treats the attribute as deprecated.

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency 'nokogiri'
end
metadata ADDED
@@ -0,0 +1,76 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: open_events
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: ruby
7
+ authors:
8
+ - Daniel Choi
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-04-18 00:00:00 -04:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: nokogiri
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: "0"
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ description: An open-source repository of events listings web scrapers and tools
28
+ email:
29
+ - dhchoi@gmail.com
30
+ executables: []
31
+
32
+ extensions: []
33
+
34
+ extra_rdoc_files: []
35
+
36
+ files:
37
+ - .gitignore
38
+ - README
39
+ - Rakefile
40
+ - boston/booksmith.rb
41
+ - boston/brattle.rb
42
+ - lib/cached_html.rb
43
+ - lib/event_scraper.rb
44
+ - lib/runner.rb
45
+ - notes.txt
46
+ - open_events.gemspec
47
+ has_rdoc: true
48
+ homepage:
49
+ licenses: []
50
+
51
+ post_install_message:
52
+ rdoc_options: []
53
+
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: 1.9.0
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: "0"
68
+ requirements: []
69
+
70
+ rubyforge_project: open_events
71
+ rubygems_version: 1.6.1
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: Events listings web scrapers and tools
75
+ test_files: []
76
+