clasrip 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +122 -0
- data/README.rdoc +9 -0
- data/Rakefile +56 -0
- data/bin/clasrip +37 -0
- data/clasrip.gemspec +66 -0
- data/lib/clasrip/sql.rb +120 -0
- data/lib/clasrip.rb +225 -0
- data/test/helper.rb +18 -0
- data/test/test_clasrip.rb +6 -0
- metadata +128 -0
data/.document
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
Creative Commons Legal Code
|
2
|
+
|
3
|
+
CC0 1.0 Universal
|
4
|
+
|
5
|
+
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
|
6
|
+
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
|
7
|
+
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
|
8
|
+
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
|
9
|
+
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
|
10
|
+
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
|
11
|
+
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
|
12
|
+
HEREUNDER.
|
13
|
+
|
14
|
+
Statement of Purpose
|
15
|
+
|
16
|
+
The laws of most jurisdictions throughout the world automatically confer
|
17
|
+
exclusive Copyright and Related Rights (defined below) upon the creator
|
18
|
+
and subsequent owner(s) (each and all, an "owner") of an original work of
|
19
|
+
authorship and/or a database (each, a "Work").
|
20
|
+
|
21
|
+
Certain owners wish to permanently relinquish those rights to a Work for
|
22
|
+
the purpose of contributing to a commons of creative, cultural and
|
23
|
+
scientific works ("Commons") that the public can reliably and without fear
|
24
|
+
of later claims of infringement build upon, modify, incorporate in other
|
25
|
+
works, reuse and redistribute as freely as possible in any form whatsoever
|
26
|
+
and for any purposes, including without limitation commercial purposes.
|
27
|
+
These owners may contribute to the Commons to promote the ideal of a free
|
28
|
+
culture and the further production of creative, cultural and scientific
|
29
|
+
works, or to gain reputation or greater distribution for their Work in
|
30
|
+
part through the use and efforts of others.
|
31
|
+
|
32
|
+
For these and/or other purposes and motivations, and without any
|
33
|
+
expectation of additional consideration or compensation, the person
|
34
|
+
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
|
35
|
+
is an owner of Copyright and Related Rights in the Work, voluntarily
|
36
|
+
elects to apply CC0 to the Work and publicly distribute the Work under its
|
37
|
+
terms, with knowledge of his or her Copyright and Related Rights in the
|
38
|
+
Work and the meaning and intended legal effect of CC0 on those rights.
|
39
|
+
|
40
|
+
1. Copyright and Related Rights. A Work made available under CC0 may be
|
41
|
+
protected by copyright and related or neighboring rights ("Copyright and
|
42
|
+
Related Rights"). Copyright and Related Rights include, but are not
|
43
|
+
limited to, the following:
|
44
|
+
|
45
|
+
i. the right to reproduce, adapt, distribute, perform, display,
|
46
|
+
communicate, and translate a Work;
|
47
|
+
ii. moral rights retained by the original author(s) and/or performer(s);
|
48
|
+
iii. publicity and privacy rights pertaining to a person's image or
|
49
|
+
likeness depicted in a Work;
|
50
|
+
iv. rights protecting against unfair competition in regards to a Work,
|
51
|
+
subject to the limitations in paragraph 4(a), below;
|
52
|
+
v. rights protecting the extraction, dissemination, use and reuse of data
|
53
|
+
in a Work;
|
54
|
+
vi. database rights (such as those arising under Directive 96/9/EC of the
|
55
|
+
European Parliament and of the Council of 11 March 1996 on the legal
|
56
|
+
protection of databases, and under any national implementation
|
57
|
+
thereof, including any amended or successor version of such
|
58
|
+
directive); and
|
59
|
+
vii. other similar, equivalent or corresponding rights throughout the
|
60
|
+
world based on applicable law or treaty, and any national
|
61
|
+
implementations thereof.
|
62
|
+
|
63
|
+
2. Waiver. To the greatest extent permitted by, but not in contravention
|
64
|
+
of, applicable law, Affirmer hereby overtly, fully, permanently,
|
65
|
+
irrevocably and unconditionally waives, abandons, and surrenders all of
|
66
|
+
Affirmer's Copyright and Related Rights and associated claims and causes
|
67
|
+
of action, whether now known or unknown (including existing as well as
|
68
|
+
future claims and causes of action), in the Work (i) in all territories
|
69
|
+
worldwide, (ii) for the maximum duration provided by applicable law or
|
70
|
+
treaty (including future time extensions), (iii) in any current or future
|
71
|
+
medium and for any number of copies, and (iv) for any purpose whatsoever,
|
72
|
+
including without limitation commercial, advertising or promotional
|
73
|
+
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
|
74
|
+
member of the public at large and to the detriment of Affirmer's heirs and
|
75
|
+
successors, fully intending that such Waiver shall not be subject to
|
76
|
+
revocation, rescission, cancellation, termination, or any other legal or
|
77
|
+
equitable action to disrupt the quiet enjoyment of the Work by the public
|
78
|
+
as contemplated by Affirmer's express Statement of Purpose.
|
79
|
+
|
80
|
+
3. Public License Fallback. Should any part of the Waiver for any reason
|
81
|
+
be judged legally invalid or ineffective under applicable law, then the
|
82
|
+
Waiver shall be preserved to the maximum extent permitted taking into
|
83
|
+
account Affirmer's express Statement of Purpose. In addition, to the
|
84
|
+
extent the Waiver is so judged Affirmer hereby grants to each affected
|
85
|
+
person a royalty-free, non transferable, non sublicensable, non exclusive,
|
86
|
+
irrevocable and unconditional license to exercise Affirmer's Copyright and
|
87
|
+
Related Rights in the Work (i) in all territories worldwide, (ii) for the
|
88
|
+
maximum duration provided by applicable law or treaty (including future
|
89
|
+
time extensions), (iii) in any current or future medium and for any number
|
90
|
+
of copies, and (iv) for any purpose whatsoever, including without
|
91
|
+
limitation commercial, advertising or promotional purposes (the
|
92
|
+
"License"). The License shall be deemed effective as of the date CC0 was
|
93
|
+
applied by Affirmer to the Work. Should any part of the License for any
|
94
|
+
reason be judged legally invalid or ineffective under applicable law, such
|
95
|
+
partial invalidity or ineffectiveness shall not invalidate the remainder
|
96
|
+
of the License, and in such case Affirmer hereby affirms that he or she
|
97
|
+
will not (i) exercise any of his or her remaining Copyright and Related
|
98
|
+
Rights in the Work or (ii) assert any associated claims and causes of
|
99
|
+
action with respect to the Work, in either case contrary to Affirmer's
|
100
|
+
express Statement of Purpose.
|
101
|
+
|
102
|
+
4. Limitations and Disclaimers.
|
103
|
+
|
104
|
+
a. No trademark or patent rights held by Affirmer are waived, abandoned,
|
105
|
+
surrendered, licensed or otherwise affected by this document.
|
106
|
+
b. Affirmer offers the Work as-is and makes no representations or
|
107
|
+
warranties of any kind concerning the Work, express, implied,
|
108
|
+
statutory or otherwise, including without limitation warranties of
|
109
|
+
title, merchantability, fitness for a particular purpose, non
|
110
|
+
infringement, or the absence of latent or other defects, accuracy, or
|
111
|
+
the present or absence of errors, whether or not discoverable, all to
|
112
|
+
the greatest extent permissible under applicable law.
|
113
|
+
c. Affirmer disclaims responsibility for clearing rights of other persons
|
114
|
+
that may apply to the Work or any use thereof, including without
|
115
|
+
limitation any person's Copyright and Related Rights in the Work.
|
116
|
+
Further, Affirmer disclaims responsibility for obtaining any necessary
|
117
|
+
consents, permissions or other rights required for any use of the
|
118
|
+
Work.
|
119
|
+
d. Affirmer understands and acknowledges that Creative Commons is not a
|
120
|
+
party to this document and has no duty or obligation with respect to
|
121
|
+
this CC0 or use of the Work.
|
122
|
+
|
data/README.rdoc
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
# encoding: utf-8

# Build script for the clasrip gem: declares the gem packaging tasks
# (via jeweler), the unit-test task and the RDoc generation task.

require 'rubygems'
require 'bundler'

# Activate the gems from the Gemfile's default and development groups;
# if Bundler cannot, report why and exit with Bundler's own status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  $stderr.puts error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit error.status_code
end

require 'rake'

require 'jeweler'
require './lib/clasrip.rb'

# Gem packaging tasks. The block receives the Gem::Specification being built;
# see http://docs.rubygems.org/read/chapter/20 for more options.
Jeweler::Tasks.new do |spec|
  spec.name        = "clasrip"
  spec.version     = Clasrip::Version
  spec.homepage    = "http://github.com/bbqsrc/clasrip"
  spec.license     = "CC0"
  spec.summary     = "A scraper for classification.gov.au"
  spec.description = "A scraper for classification.gov.au"
  spec.email       = "brendan@bbqsrc.net"
  spec.authors     = ["Brendan Molloy"]
  spec.executables = ["clasrip"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

# `rake test` runs every test/**/test_*.rb file with lib/ and test/ on the
# load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |t|
  t.libs.push('lib', 'test')
  t.pattern = 'test/**/test_*.rb'
  t.verbose = true
end

# Coverage task, currently disabled:
#require 'rcov/rcovtask'
#Rcov::RcovTask.new do |test|
#  test.libs << 'test'
#  test.pattern = 'test/**/test_*.rb'
#  test.verbose = true
#  test.rcov_opts << '--exclude "gems/*"'
#end

# Plain `rake` runs the tests.
task :default => [:test]

# `rake rdoc` writes API documentation for the README and lib/ into rdoc/.
require 'rdoc/task'
Rake::RDocTask.new do |doc|
  doc.rdoc_dir = 'rdoc'
  doc.title = "clasrip #{Clasrip::Version}"
  doc.rdoc_files.include('README*')
  doc.rdoc_files.include('lib/**/*.rb')
end
|
data/bin/clasrip
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line entry point for clasrip.
#
# Usage: clasrip rip <database-url>
#
# Scrapes classification records and stores each one through Clasrip::SQL.
# When the database already holds records, the scraper is rewound to the
# first day of the month before the newest stored record and fast-forwarded
# until that record is seen again, so that only newer records are inserted.
#
# This script intentionally has no `if $0 == __FILE__` guard: when the gem is
# installed, the command is started through a generated wrapper that loads
# this file, so `$0` is the wrapper's path and such a guard would turn the
# installed command into a no-op.

# Make the library next to this script loadable regardless of the current
# working directory (appended, so an installed copy of the gem still wins).
$:.push(File.expand_path("../../lib", __FILE__))

require "clasrip"
require "clasrip/sql"
require "date"

if ARGV.size >= 2 and ARGV[0] == "rip"
  scraper = Clasrip::Scraper.new(1971, Date.today.year + 1)
  sql = Clasrip::SQL.new(ARGV[1])

  newest = Clasrip::SQL::Classification.last
  unless newest.nil?
    # Turn the stored row into the hash shape used for comparison below:
    # no :id, and the date as an ISO 8601 string.
    # NOTE(review): the match relies on the scraper yielding a hash that is
    # exactly equal (same keys, same value formatting) to these attributes -
    # confirm against Clasrip::Scraper's output.
    last_date = newest.date_of_classification
    last_record = newest.attributes
    last_record.delete(:id)
    last_record[:date_of_classification] = last_date.iso8601

    # Rewind to the first day of the previous month. Date#<< handles the
    # January -> December-of-previous-year wrap, which `month - 1` did not
    # (it produced month 0).
    rewind_to = last_date << 1
    scraper.set_date(rewind_to.year, rewind_to.month, 1)
    #puts "Set date to: #{scraper.get_date}"

    print "Finding last record (#{last_record[:title]})... "
    found = false
    scraper.each do |record|
      if record == last_record
        found = true
        break
      end
    end
    puts(found ? "Found!" : "Not found.")
  end

  # Store everything the scraper yields from its current position onwards.
  scraper.each do |record|
    puts "(#{record[:date_of_classification]}) #{record[:title]} [#{record[:classification]}]"
    sql.add_record(record)
  end
else
  $stderr.puts "Usage: clasrip rip <database-url>"
  exit 1
end
|
37
|
+
|
data/clasrip.gemspec
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "clasrip"
|
8
|
+
s.version = "0.1.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Brendan Molloy"]
|
12
|
+
s.date = "2012-02-25"
|
13
|
+
s.description = "A scraper for classification.gov.au"
|
14
|
+
s.email = "brendan@bbqsrc.net"
|
15
|
+
s.executables = ["clasrip"]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"LICENSE.txt",
|
18
|
+
"README.rdoc"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
".document",
|
22
|
+
"Gemfile",
|
23
|
+
"LICENSE.txt",
|
24
|
+
"README.rdoc",
|
25
|
+
"Rakefile",
|
26
|
+
"bin/clasrip",
|
27
|
+
"clasrip.gemspec",
|
28
|
+
"lib/clasrip.rb",
|
29
|
+
"lib/clasrip/sql.rb",
|
30
|
+
"test/helper.rb",
|
31
|
+
"test/test_clasrip.rb"
|
32
|
+
]
|
33
|
+
s.homepage = "http://github.com/bbqsrc/clasrip"
|
34
|
+
s.licenses = ["CC0"]
|
35
|
+
s.require_paths = ["lib"]
|
36
|
+
s.rubygems_version = "1.8.11"
|
37
|
+
s.summary = "A scraper for classification.gov.au"
|
38
|
+
|
39
|
+
if s.respond_to? :specification_version then
|
40
|
+
s.specification_version = 3
|
41
|
+
|
42
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
43
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.5.0"])
|
44
|
+
s.add_runtime_dependency(%q<data_mapper>, [">= 1.2.0"])
|
45
|
+
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
46
|
+
s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
|
47
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
48
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.8.3"])
|
49
|
+
else
|
50
|
+
s.add_dependency(%q<nokogiri>, [">= 1.5.0"])
|
51
|
+
s.add_dependency(%q<data_mapper>, [">= 1.2.0"])
|
52
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
53
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
54
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
55
|
+
s.add_dependency(%q<jeweler>, ["~> 1.8.3"])
|
56
|
+
end
|
57
|
+
else
|
58
|
+
s.add_dependency(%q<nokogiri>, [">= 1.5.0"])
|
59
|
+
s.add_dependency(%q<data_mapper>, [">= 1.2.0"])
|
60
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
61
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
62
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
63
|
+
s.add_dependency(%q<jeweler>, ["~> 1.8.3"])
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
data/lib/clasrip/sql.rb
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
require "data_mapper"
|
3
|
+
|
4
|
+
module Clasrip
|
5
|
+
# Persistence layer: sets up DataMapper for a database URL and stores
# classification / PEGI records in it.
class SQL
  # One classification record. Every column is free text except the serial
  # primary key and the classification date.
  class Classification
    include DataMapper::Resource

    property :id, Serial
    property :title, Text
    property :original_url, Text
    property :classification, Text
    property :consumer_advice, Text
    property :category, Text
    property :medium, Text
    property :version, Text
    property :duration, Text
    property :date_of_classification, Date
    property :author, Text
    property :publisher, Text
    property :production_company, Text
    property :country_of_origin, Text
    property :applicant, Text
    property :file_number, Text
    property :classification_number, Text

    # Failed saves raise instead of returning false.
    self.raise_on_save_failure = true
  end

  # One PEGI rating record: title/platform metadata plus one boolean flag per
  # content descriptor.
  class PEGIRating
    include DataMapper::Resource

    property :id, Serial
    property :title, Text
    property :release_date, Date
    property :url, Text
    property :platform, Text
    property :base_age_category, Text
    property :violence, Boolean
    property :sex, Boolean
    property :drugs, Boolean
    property :fear, Boolean
    property :discrimination, Boolean
    property :bad_language, Boolean
    property :gambling, Boolean
    property :pegi_online, Boolean

    # Failed saves raise instead of returning false.
    self.raise_on_save_failure = true
  end

  # Connects DataMapper's :default repository to +sql_url+ and creates or
  # upgrades both tables.
  #
  # The first "///" in the URL is rewritten to "//<current directory>/", so a
  # URL with an empty host part ends up pointing at a path under the working
  # directory.
  def initialize(sql_url)
    sql_url = sql_url.sub("///", "//#{Dir.pwd}/")
    #DataMapper::Logger.new($stdout, :debug)
    DataMapper.setup(:default, sql_url)
    Classification.auto_upgrade!
    PEGIRating.auto_upgrade!
  end

  # Inserts +record+ (an attribute hash) as a new row.
  #
  # +type+ selects the table: :classification (default) or :pegi.
  # Raises RuntimeError ("type not supported") for any other type.
  def add_record(record, type=:classification)
    if type == :classification
      Classification.create(record)
    elsif type == :pegi
      PEGIRating.create(record)
    else
      raise "type not supported"
    end
  end

  # Imports <classification> elements found under a <classifications> root.
  #
  # +xml_fn+ is handed directly to Nokogiri::XML.
  # NOTE(review): despite the parameter name, nothing here opens a file -
  # confirm callers pass XML text or an IO rather than a file name.
  #
  # When the table already has rows, every element up to and including the
  # one whose <classification-number> equals that of the last stored row is
  # skipped, and only the elements after it are inserted. If no element
  # matches, nothing is inserted. Progress is printed to stdout.
  def parse_xml(xml_fn)
    print "Parsing XML... "
    xml = Nokogiri::XML(xml_fn, &:noblanks)
    puts "Done."

    classifications = xml.css("classifications > classification")
    record = Classification.last

    if record.nil?
      puts "Null record"
    else
      puts "Record: #{record.attributes[:title]}"
      print "Finding position... "

      last_number = record.attributes[:classification_number]
      searching = true
      classifications = classifications.drop_while do |node|
        # Once the resume point has been passed, keep everything.
        next false unless searching

        number = node.at_css("classification-number")
        if number && number.content == last_number
          searching = false
          puts "Done!"
        end
        # Drop this node: it is either before the resume point or the
        # already-stored record itself. (A `return` here would leave
        # parse_xml altogether and import nothing.)
        true
      end
      puts "Not found." if searching
    end

    clen = classifications.length
    count = 0

    classifications.each do |node|
      count += 1
      attributes = {}
      node.children.each do |child|
        next if child.name == "text"
        # Element names such as "date-of-classification" map onto the
        # snake_case property names of Classification.
        name = child.name.gsub("-", "_").to_sym
        if name == :date_of_classification
          attributes[name] = Date.strptime(child.content, "%Y-%m-%d")
        else
          attributes[name] = child.content
        end
      end
      Classification.create(attributes)
      print "\r#{count}/#{clen}"
    end
    puts "\nDone!"
  end

  # Not implemented yet; currently returns nil.
  def to_xml(table)
    #stub
  end
end
|
118
|
+
end
|
119
|
+
|
120
|
+
DataMapper.finalize
|
data/lib/clasrip.rb
ADDED
@@ -0,0 +1,225 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
require "net/http"
|
3
|
+
|
4
|
+
module Clasrip
|
5
|
+
# Version number of the library, split into its components.
module Version
  MAJOR = 0
  MINOR = 1
  PATCH = 0
  BUILD = nil

  class << self
    # Dotted version string built from the components that are not nil,
    # e.g. "0.1.0" while BUILD is nil.
    def to_s
      components = [MAJOR, MINOR, PATCH, BUILD].reject { |part| part.nil? }
      components.join('.')
    end
  end
end
|
14
|
+
|
15
|
+
# Raised when a search result page reports exactly 1000 hits, i.e. the result
# set may have been cut off at the query's limit and records could be missing.
#
# Derives from StandardError (not Exception) so that it behaves like an
# ordinary application error; code that rescues DataIntegrityWarning by name
# is unaffected.
class DataIntegrityWarning < StandardError; end
|
16
|
+
|
17
|
+
# Stateful generator of half-month boundary dates, formatted "day/month/year".
#
# Starting from year +start+, successive calls to #next produce the 1st and
# the 15th of every month in order. Generation stops (StopIteration) once the
# current year has reached +finish+; because that check happens before the
# step, the 1st of January of +finish+ is the last value produced.
class DatesBetween
  attr_accessor :year, :month, :day

  # start::  first year to generate dates for
  # finish:: year at which generation stops
  def initialize(start, finish)
    @year = start
    @finish = finish
    # Month 0 / day 0 is the "not started" position; the first #next moves
    # to 1/1/<start>.
    @month = 0
    @day = 0
  end

  # Current position as "day/month/year" (no zero padding).
  def to_s
    [@day, @month, @year].join("/")
  end

  # Advances to the following boundary and returns it as a string.
  # Raises StopIteration when the current year is already >= the finish year.
  def next
    raise StopIteration if @year >= @finish

    if @day < 1 or @day >= 15
      # Not in the first half of a month: go to the 1st of the next month.
      @day = 1
      @month += 1
    else
      # In the first half: jump to the 15th of the same month.
      @day = 15
    end

    if @month > 12
      @year += 1
      @month = 1
    end

    to_s
  end

  # Yields each remaining boundary string until the finish year is reached.
  # Kernel#loop ends quietly on StopIteration, whether it comes from #next or
  # from the caller's block; the method-level rescue is kept as a fallback.
  def each
    loop { yield self.next }
  rescue StopIteration
    self
  end
end
|
56
|
+
|
57
|
+
# Walks the classification search in half-month windows and yields one hash
# per classification record (listing data merged with the detail page).
#
# Two DatesBetween cursors are kept one step apart: @dates[0] is the start of
# the current window and @dates[1] its end.
#
# NOTE(review): the constructor opens the HTTP connection immediately.
class Scraper
  # How many times a single failing request is retried (each time on a fresh
  # connection) before the error is passed on to the caller.
  MAX_RETRIES = 5

  # start_year:: first year to scrape
  # end_year::   year at which scraping stops (see DatesBetween)
  def initialize(start_year, end_year)
    @dates = [
      DatesBetween.new(start_year, end_year),
      DatesBetween.new(start_year, end_year)
    ]
    # Keep the window-end cursor one step ahead of the window-start cursor.
    @dates[1].next

    @host_url = "www.classification.gov.au"
    # Format string: the two %s are the window start and end dates; %% is a
    # literal percent sign of a URL escape.
    @query_url = "/www/cob/find.nsf/classifications?search&searchwv=1&searchmax=1000&count=1000&query=(%%5BclassificationDate%%5D%%3E=%s)AND(%%5BclassificationDate%%5D%%3C%s)"
    new_conn
    new_enum
  end

  # Returns the next record (external iteration over the record enumerator).
  def next
    @records.next
  end

  # Yields every record from the current date position onwards.
  def each
    @records.each do |r|
      yield r
    end
  end

  # Returns the next record without consuming it.
  def peek
    @records.peek
  end

  # Moves both date cursors to the given position, then advances the
  # window-end cursor by one step so the two stay one step apart.
  def set_date(year, month, day)
    @dates.each do |date|
      date.year = year.to_i
      date.month = month.to_i
      date.day = day.to_i
    end
    @dates[1].next
  end

  # Returns the window-start cursor (a DatesBetween).
  def get_date
    @dates[0]
  end

  private

  # Opens a fresh HTTP connection to the host (3 second read timeout).
  def new_conn
    @conn = Net::HTTP.new(@host_url, 80)
    @conn.read_timeout = 3
    @conn.start
  end

  # GETs +arg+ (a request path) and returns the response.
  #
  # On failure the connection is re-opened and the request retried, at most
  # MAX_RETRIES times; after that the last error is re-raised rather than
  # retrying forever. An error raised while re-opening the connection is not
  # retried.
  def get_conn(arg)
    attempts = 0
    begin
      return @conn.get(arg)
    rescue StandardError
      attempts += 1
      raise if attempts > MAX_RETRIES
      new_conn
      retry
    end
  end

  # Builds the lazy record enumerator used by #next, #each and #peek.
  #
  # For each window it fetches the result table; if the result count hits the
  # 1000 limit (DataIntegrityWarning) the window is re-queried once per
  # rating group instead. Every listed row is then completed with the fields
  # of its detail page and yielded.
  def new_enum
    @records = Enumerator.new do |y|
      @dates[0].each do
        # Advance the window end in step with the window start. When this
        # cursor is exhausted its StopIteration ends the enclosing #each.
        @dates[1].next

        tables = []
        begin
          # No result table for this window: move on to the next one.
          t = get_table or next
          check_result_count
          tables.push(t)
        rescue DataIntegrityWarning
          tables_by_rating do |table|
            next if table == nil
            # Raises again (and propagates) if even a single rating group
            # reaches the limit.
            check_result_count
            tables.push(table)
          end
        end

        tables.each do |table|
          parse_table(table).each do |record|
            # Skip rows whose detail page has no form.
            form = get_classification(record[:original_url]) or next
            record.merge!(parse_classification(form))
            y << record
          end
        end
      end
    end
  end

  # Re-runs the current window's query once per rating group and yields each
  # result table (nil when a page has none). Leaves the last fetched page in
  # @html so that check_result_count inspects it.
  def tables_by_rating
    ratings = {
      "Unrestricted" => ["Unrestricted"],
      "G" => ["Likely G", "G"],
      "PG" => ["Likely PG", "PG", "G 8+"],
      "M" => ["Likely M", "M"],
      "MA" => ["Likely MA 15+", "MA15+ Conditions", "MA 15+"],
      "R" => ["Likely R 18+", "R", "R 18+"],
      "X" => ["Likely X 18+", "X", "X 18+"],
      "CAT1" => ["CAT 1"],
      "CAT2" => ["CAT 2"],
      "RC" => ["RC"],
      "Misc" => ["Revoked", "Ad Approved", "Approved", "Ad Refused", "Refused"]
    }

    ratings.each_pair do |_group, names|
      clauses = names.map{|i| "(%5Brating=#{i.gsub(' ', '+')}%5D)"}
      filter = "AND(#{clauses.join('OR')})"
      # The filter is appended after formatting because it contains raw '%'.
      res = get_conn((@query_url % [@dates[0].to_s, @dates[1].to_s]) + filter)
      @html = Nokogiri::HTML(res.read_body)
      yield @html.at_css("#results > table")
    end
  end

  # Reads the total from the first ".content p" element of the page in @html
  # (the number following "of ") and returns it as an integer.
  # Raises DataIntegrityWarning when the total is exactly 1000.
  def check_result_count
    results = @html.at_css(".content p").content
    results = results.sub(/.*of (\d+) .*/, "\\1").to_i
    if results == 1000
      raise DataIntegrityWarning, "1000 results detected. Some records may be missed for URL: #{@query_url % [@dates[0].to_s, @dates[1].to_s]}"
    end
    results
  end

  # Fetches the result page for the current window, stores it in @html and
  # returns its "#results > table" element (nil if absent).
  def get_table
    res = get_conn(@query_url % [@dates[0].to_s, @dates[1].to_s])
    @html = Nokogiri::HTML(res.read_body)
    return @html.at_css("#results > table")
  end

  # Turns a result table into an array of {:title, :original_url} hashes,
  # one per row whose first child is a <td>. :original_url is the last path
  # segment of the link in the second cell, without its query string.
  def parse_table(table)
    records = []
    table.xpath("tr").each do |row|
      row.children[0].node_name == "td" or next
      link = row.xpath('td[2]/a').first

      record = {}
      record[:title] = link.content
      record[:original_url] = link['href'].split('/').last.split('?').first

      records.push(record)
    end
    return records
  end

  # Fetches the detail page for +url+ (a document id as produced by
  # parse_table), stores it in @html and returns its ".fform" element
  # (nil if absent).
  def get_classification(url)
    res = get_conn("/www/cob/find.nsf/d853f429dd038ae1ca25759b0003557c/#{url}")
    @html = Nokogiri::HTML(res.read_body)
    @html.at_css(".fform")
  end

  # Converts a detail form into a hash: each ".frow" contributes one entry
  # whose key is the label text (stripped, lower-cased, spaces -> "_", as a
  # symbol) and whose value is the field text.
  #
  # Reads the page left in @html by get_classification to derive :medium, so
  # it must be called right after it.
  def parse_classification(form)
    record = {}
    form.css(".frow").each do |row|
      label = row.at_css(".flabel").content.strip.downcase.gsub(" ", "_").to_sym
      field = row.at_css(".ffield").content

      field = field.encode("UTF-8") unless field.valid_encoding?
      # Trim and drop non-breaking spaces, but only on valid text.
      field = field.strip.gsub("\u00A0", "") if field.valid_encoding?

      if label == :date_of_classification
        # Reorders "a/b/c" into "c-a-b".
        # NOTE(review): this presumes the page shows month/day/year - confirm.
        date = field.split('/')
        field = "#{date[2]}-#{date[0]}-#{date[1]}"
      elsif label == :version
        # The medium is the parenthesised text in the second child of the
        # page's first ".content p" element; empty when that child is absent.
        fld = @html.at_css(".content p").children[1]
        record[:medium] = fld.content.sub(/.*\((.*?)\)/, "\\1") if fld != nil
        record[:medium] = "" if fld == nil
      end

      record[label] = field
    end
    record
  end
end
|
225
|
+
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Shared test setup: activates the bundled gems, loads the test frameworks
# and makes lib/ and test/ requirable before loading the library.

require 'rubygems'
require 'bundler'

# Activate the Gemfile's default and development groups; on failure report
# the reason and exit with Bundler's own status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  $stderr.puts error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit error.status_code
end

require 'test/unit'
require 'shoulda'

# test/ ends up first on the load path, lib/ second.
test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(test_dir, '..', 'lib'))
$LOAD_PATH.unshift(test_dir)
require 'clasrip'

# Reopened so project-wide test helpers have a place to live; empty for now.
class Test::Unit::TestCase
end
|
metadata
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: clasrip
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Brendan Molloy
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-02-25 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: &70095153397460 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.5.0
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70095153397460
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: data_mapper
|
27
|
+
requirement: &70095153396980 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.2.0
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70095153396980
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: shoulda
|
38
|
+
requirement: &70095153396500 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *70095153396500
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rdoc
|
49
|
+
requirement: &70095153395980 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.12'
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *70095153395980
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: bundler
|
60
|
+
requirement: &70095153395480 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ~>
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: 1.0.0
|
66
|
+
type: :development
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *70095153395480
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: jeweler
|
71
|
+
requirement: &70095153394980 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ~>
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: 1.8.3
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: *70095153394980
|
80
|
+
description: A scraper for classification.gov.au
|
81
|
+
email: brendan@bbqsrc.net
|
82
|
+
executables:
|
83
|
+
- clasrip
|
84
|
+
extensions: []
|
85
|
+
extra_rdoc_files:
|
86
|
+
- LICENSE.txt
|
87
|
+
- README.rdoc
|
88
|
+
files:
|
89
|
+
- .document
|
90
|
+
- Gemfile
|
91
|
+
- LICENSE.txt
|
92
|
+
- README.rdoc
|
93
|
+
- Rakefile
|
94
|
+
- bin/clasrip
|
95
|
+
- clasrip.gemspec
|
96
|
+
- lib/clasrip.rb
|
97
|
+
- lib/clasrip/sql.rb
|
98
|
+
- test/helper.rb
|
99
|
+
- test/test_clasrip.rb
|
100
|
+
homepage: http://github.com/bbqsrc/clasrip
|
101
|
+
licenses:
|
102
|
+
- CC0
|
103
|
+
post_install_message:
|
104
|
+
rdoc_options: []
|
105
|
+
require_paths:
|
106
|
+
- lib
|
107
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
108
|
+
none: false
|
109
|
+
requirements:
|
110
|
+
- - ! '>='
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
version: '0'
|
113
|
+
segments:
|
114
|
+
- 0
|
115
|
+
hash: -3829822103400611157
|
116
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
117
|
+
none: false
|
118
|
+
requirements:
|
119
|
+
- - ! '>='
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
version: '0'
|
122
|
+
requirements: []
|
123
|
+
rubyforge_project:
|
124
|
+
rubygems_version: 1.8.11
|
125
|
+
signing_key:
|
126
|
+
specification_version: 3
|
127
|
+
summary: A scraper for classification.gov.au
|
128
|
+
test_files: []
|