clasrip 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/clasrip +25 -29
- data/clasrip.gemspec +2 -2
- data/lib/clasrip.rb +14 -1
- metadata +15 -15
data/bin/clasrip
CHANGED
@@ -1,37 +1,33 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
$:.push("./lib")
|
3
2
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
require "date"
|
3
|
+
require "clasrip"
|
4
|
+
require "clasrip/sql"
|
5
|
+
require "date"
|
8
6
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
last_record = Clasrip::SQL::Classification.last
|
14
|
-
if last_record != nil
|
15
|
-
last_record = last_record.attributes
|
16
|
-
last_record.delete(:id)
|
17
|
-
last_record[:date_of_classification] = last_record[:date_of_classification].iso8601
|
18
|
-
|
19
|
-
date = Clasrip::SQL::Classification.last.date_of_classification
|
20
|
-
scraper.set_date(date.year, date.month-1, 1)
|
21
|
-
#puts "Set date to: #{scraper.get_date}"
|
7
|
+
if ARGV.size >= 2 and ARGV[0] == "rip"
|
8
|
+
scraper = Clasrip::Scraper.new(1971, Date.today.year + 1)
|
9
|
+
sql = Clasrip::SQL.new(ARGV[1])
|
22
10
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
11
|
+
last_record = Clasrip::SQL::Classification.last
|
12
|
+
if last_record != nil
|
13
|
+
last_record = last_record.attributes
|
14
|
+
last_record.delete(:id)
|
15
|
+
last_record[:date_of_classification] = last_record[:date_of_classification].iso8601
|
16
|
+
|
17
|
+
date = Clasrip::SQL::Classification.last.date_of_classification
|
18
|
+
scraper.set_date(date.year, date.month-1, 1)
|
29
19
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
20
|
+
print "Finding last record (#{last_record[:title]})... "
|
21
|
+
scraper.each do |record|
|
22
|
+
break if record == last_record
|
23
|
+
end
|
24
|
+
puts "Found!"
|
25
|
+
end
|
26
|
+
|
27
|
+
date = scraper.get_date
|
28
|
+
scraper.each do |record|
|
29
|
+
puts "(#{record[:date_of_classification]}) #{record[:title]} [#{record[:classification]}]"
|
30
|
+
sql.add_record(record)
|
35
31
|
end
|
36
32
|
end
|
37
33
|
|
data/clasrip.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "clasrip"
|
8
|
-
s.version = "0.1.0"
|
8
|
+
s.version = "0.1.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Brendan Molloy"]
|
12
|
-
s.date = "2012-02-
|
12
|
+
s.date = "2012-02-26"
|
13
13
|
s.description = "A scraper for classification.gov.au"
|
14
14
|
s.email = "brendan@bbqsrc.net"
|
15
15
|
s.executables = ["clasrip"]
|
data/lib/clasrip.rb
CHANGED
@@ -5,7 +5,7 @@ module Clasrip
|
|
5
5
|
module Version
|
6
6
|
MAJOR = 0
|
7
7
|
MINOR = 1
|
8
|
-
PATCH = 0
|
8
|
+
PATCH = 1
|
9
9
|
BUILD = nil
|
10
10
|
def self.to_s
|
11
11
|
[MAJOR, MINOR, PATCH, BUILD].compact.join('.')
|
@@ -111,6 +111,16 @@ module Clasrip
|
|
111
111
|
end
|
112
112
|
end
|
113
113
|
|
114
|
+
def ensure_correct_encoding(s)
|
115
|
+
s.force_encoding("utf-8")
|
116
|
+
return s if s.valid_encoding?
|
117
|
+
|
118
|
+
puts ("Invalid: " + s)
|
119
|
+
s.encode!("utf-8", "iso-8859-1")
|
120
|
+
raise "Could not enforce UTF-8 encoding: '#{s}'" unless s.valid_encoding?
|
121
|
+
s
|
122
|
+
end
|
123
|
+
|
114
124
|
def new_enum
|
115
125
|
@records = Enumerator.new do |y|
|
116
126
|
@dates[0].each do |first_date|
|
@@ -133,6 +143,9 @@ module Clasrip
|
|
133
143
|
parse_table(table).each do |record|
|
134
144
|
form = get_classification(record[:original_url]) or next
|
135
145
|
record.merge!(parse_classification(form))
|
146
|
+
record.each_pair do |k,v|
|
147
|
+
record[k] = ensure_correct_encoding(v)
|
148
|
+
end
|
136
149
|
y << record
|
137
150
|
end
|
138
151
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: clasrip
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
|
-
requirement: &
|
16
|
+
requirement: &70149198193640 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.5.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70149198193640
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: data_mapper
|
27
|
-
requirement: &
|
27
|
+
requirement: &70149198192960 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 1.2.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70149198192960
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: shoulda
|
38
|
-
requirement: &
|
38
|
+
requirement: &70149198192360 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70149198192360
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rdoc
|
49
|
-
requirement: &
|
49
|
+
requirement: &70149198191660 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '3.12'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70149198191660
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: bundler
|
60
|
-
requirement: &
|
60
|
+
requirement: &70149198190640 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: 1.0.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70149198190640
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: jeweler
|
71
|
-
requirement: &
|
71
|
+
requirement: &70149198206040 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,7 +76,7 @@ dependencies:
|
|
76
76
|
version: 1.8.3
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70149198206040
|
80
80
|
description: A scraper for classification.gov.au
|
81
81
|
email: brendan@bbqsrc.net
|
82
82
|
executables:
|
@@ -112,7 +112,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
112
112
|
version: '0'
|
113
113
|
segments:
|
114
114
|
- 0
|
115
|
-
hash: -
|
115
|
+
hash: -1633970238896182400
|
116
116
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
117
117
|
none: false
|
118
118
|
requirements:
|