clasrip 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/clasrip +25 -29
- data/clasrip.gemspec +2 -2
- data/lib/clasrip.rb +14 -1
- metadata +15 -15
data/bin/clasrip
CHANGED
@@ -1,37 +1,33 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
$:.push("./lib")
|
3
2
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
require "date"
|
3
|
+
require "clasrip"
|
4
|
+
require "clasrip/sql"
|
5
|
+
require "date"
|
8
6
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
last_record = Clasrip::SQL::Classification.last
|
14
|
-
if last_record != nil
|
15
|
-
last_record = last_record.attributes
|
16
|
-
last_record.delete(:id)
|
17
|
-
last_record[:date_of_classification] = last_record[:date_of_classification].iso8601
|
18
|
-
|
19
|
-
date = Clasrip::SQL::Classification.last.date_of_classification
|
20
|
-
scraper.set_date(date.year, date.month-1, 1)
|
21
|
-
#puts "Set date to: #{scraper.get_date}"
|
7
|
+
if ARGV.size >= 2 and ARGV[0] == "rip"
|
8
|
+
scraper = Clasrip::Scraper.new(1971, Date.today.year + 1)
|
9
|
+
sql = Clasrip::SQL.new(ARGV[1])
|
22
10
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
11
|
+
last_record = Clasrip::SQL::Classification.last
|
12
|
+
if last_record != nil
|
13
|
+
last_record = last_record.attributes
|
14
|
+
last_record.delete(:id)
|
15
|
+
last_record[:date_of_classification] = last_record[:date_of_classification].iso8601
|
16
|
+
|
17
|
+
date = Clasrip::SQL::Classification.last.date_of_classification
|
18
|
+
scraper.set_date(date.year, date.month-1, 1)
|
29
19
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
20
|
+
print "Finding last record (#{last_record[:title]})... "
|
21
|
+
scraper.each do |record|
|
22
|
+
break if record == last_record
|
23
|
+
end
|
24
|
+
puts "Found!"
|
25
|
+
end
|
26
|
+
|
27
|
+
date = scraper.get_date
|
28
|
+
scraper.each do |record|
|
29
|
+
puts "(#{record[:date_of_classification]}) #{record[:title]} [#{record[:classification]}]"
|
30
|
+
sql.add_record(record)
|
35
31
|
end
|
36
32
|
end
|
37
33
|
|
data/clasrip.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "clasrip"
|
8
|
-
s.version = "0.1.0"
|
8
|
+
s.version = "0.1.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Brendan Molloy"]
|
12
|
-
s.date = "2012-02-
|
12
|
+
s.date = "2012-02-26"
|
13
13
|
s.description = "A scraper for classification.gov.au"
|
14
14
|
s.email = "brendan@bbqsrc.net"
|
15
15
|
s.executables = ["clasrip"]
|
data/lib/clasrip.rb
CHANGED
@@ -5,7 +5,7 @@ module Clasrip
|
|
5
5
|
module Version
|
6
6
|
MAJOR = 0
|
7
7
|
MINOR = 1
|
8
|
-
PATCH = 0
|
8
|
+
PATCH = 1
|
9
9
|
BUILD = nil
|
10
10
|
def self.to_s
|
11
11
|
[MAJOR, MINOR, PATCH, BUILD].compact.join('.')
|
@@ -111,6 +111,16 @@ module Clasrip
|
|
111
111
|
end
|
112
112
|
end
|
113
113
|
|
114
|
+
def ensure_correct_encoding(s)
|
115
|
+
s.force_encoding("utf-8")
|
116
|
+
return s if s.valid_encoding?
|
117
|
+
|
118
|
+
puts ("Invalid: " + s)
|
119
|
+
s.encode!("utf-8", "iso-8859-1")
|
120
|
+
raise "Could not enforce UTF-8 encoding: '#{s}'" unless s.valid_encoding?
|
121
|
+
s
|
122
|
+
end
|
123
|
+
|
114
124
|
def new_enum
|
115
125
|
@records = Enumerator.new do |y|
|
116
126
|
@dates[0].each do |first_date|
|
@@ -133,6 +143,9 @@ module Clasrip
|
|
133
143
|
parse_table(table).each do |record|
|
134
144
|
form = get_classification(record[:original_url]) or next
|
135
145
|
record.merge!(parse_classification(form))
|
146
|
+
record.each_pair do |k,v|
|
147
|
+
record[k] = ensure_correct_encoding(v)
|
148
|
+
end
|
136
149
|
y << record
|
137
150
|
end
|
138
151
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: clasrip
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
|
-
requirement: &
|
16
|
+
requirement: &70149198193640 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.5.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70149198193640
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: data_mapper
|
27
|
-
requirement: &
|
27
|
+
requirement: &70149198192960 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 1.2.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70149198192960
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: shoulda
|
38
|
-
requirement: &
|
38
|
+
requirement: &70149198192360 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70149198192360
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rdoc
|
49
|
-
requirement: &
|
49
|
+
requirement: &70149198191660 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '3.12'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70149198191660
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: bundler
|
60
|
-
requirement: &
|
60
|
+
requirement: &70149198190640 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: 1.0.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70149198190640
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: jeweler
|
71
|
-
requirement: &
|
71
|
+
requirement: &70149198206040 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,7 +76,7 @@ dependencies:
|
|
76
76
|
version: 1.8.3
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70149198206040
|
80
80
|
description: A scraper for classification.gov.au
|
81
81
|
email: brendan@bbqsrc.net
|
82
82
|
executables:
|
@@ -112,7 +112,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
112
112
|
version: '0'
|
113
113
|
segments:
|
114
114
|
- 0
|
115
|
-
hash: -
|
115
|
+
hash: -1633970238896182400
|
116
116
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
117
117
|
none: false
|
118
118
|
requirements:
|