imw 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +15 -0
- data/CHANGELOG +0 -0
- data/LICENSE +674 -0
- data/README.rdoc +101 -0
- data/Rakefile +20 -0
- data/VERSION +1 -0
- data/etc/imwrc.rb +76 -0
- data/lib/imw.rb +42 -0
- data/lib/imw/boot.rb +58 -0
- data/lib/imw/dataset.rb +233 -0
- data/lib/imw/dataset/datamapper.rb +66 -0
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
- data/lib/imw/dataset/loaddump.rb +50 -0
- data/lib/imw/dataset/old/file_collection.rb +88 -0
- data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
- data/lib/imw/dataset/scaffold.rb +132 -0
- data/lib/imw/dataset/scraped_uri.rb +305 -0
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
- data/lib/imw/dataset/scrub/scrub.rb +147 -0
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
- data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
- data/lib/imw/dataset/scrub/slug.rb +101 -0
- data/lib/imw/dataset/stats.rb +73 -0
- data/lib/imw/dataset/stats/counter.rb +23 -0
- data/lib/imw/dataset/task.rb +38 -0
- data/lib/imw/dataset/workflow.rb +81 -0
- data/lib/imw/files.rb +110 -0
- data/lib/imw/files/archive.rb +113 -0
- data/lib/imw/files/basicfile.rb +122 -0
- data/lib/imw/files/binary.rb +28 -0
- data/lib/imw/files/compressed_file.rb +93 -0
- data/lib/imw/files/compressed_files_and_archives.rb +348 -0
- data/lib/imw/files/compressible.rb +103 -0
- data/lib/imw/files/csv.rb +112 -0
- data/lib/imw/files/json.rb +41 -0
- data/lib/imw/files/sgml.rb +65 -0
- data/lib/imw/files/text.rb +68 -0
- data/lib/imw/files/yaml.rb +46 -0
- data/lib/imw/packagers.rb +8 -0
- data/lib/imw/packagers/archiver.rb +108 -0
- data/lib/imw/packagers/s3_mover.rb +28 -0
- data/lib/imw/parsers.rb +7 -0
- data/lib/imw/parsers/html_parser.rb +382 -0
- data/lib/imw/parsers/html_parser/matchers.rb +306 -0
- data/lib/imw/parsers/line_parser.rb +87 -0
- data/lib/imw/parsers/regexp_parser.rb +72 -0
- data/lib/imw/utils.rb +24 -0
- data/lib/imw/utils/components.rb +61 -0
- data/lib/imw/utils/config.rb +46 -0
- data/lib/imw/utils/error.rb +54 -0
- data/lib/imw/utils/extensions/array.rb +125 -0
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
- data/lib/imw/utils/extensions/core.rb +43 -0
- data/lib/imw/utils/extensions/dir.rb +24 -0
- data/lib/imw/utils/extensions/file_core.rb +64 -0
- data/lib/imw/utils/extensions/hash.rb +218 -0
- data/lib/imw/utils/extensions/hpricot.rb +48 -0
- data/lib/imw/utils/extensions/string.rb +49 -0
- data/lib/imw/utils/extensions/struct.rb +42 -0
- data/lib/imw/utils/extensions/symbol.rb +28 -0
- data/lib/imw/utils/extensions/typed_struct.rb +22 -0
- data/lib/imw/utils/extensions/uri.rb +59 -0
- data/lib/imw/utils/log.rb +67 -0
- data/lib/imw/utils/misc.rb +63 -0
- data/lib/imw/utils/paths.rb +115 -0
- data/lib/imw/utils/uri.rb +59 -0
- data/lib/imw/utils/uuid.rb +33 -0
- data/lib/imw/utils/validate.rb +38 -0
- data/lib/imw/utils/version.rb +12 -0
- data/lib/imw/utils/view.rb +113 -0
- data/lib/imw/utils/view/dump_csv.rb +112 -0
- data/lib/imw/utils/view/dump_csv_older.rb +117 -0
- data/spec/data/sample.csv +131 -0
- data/spec/data/sample.tsv +131 -0
- data/spec/data/sample.txt +131 -0
- data/spec/data/sample.xml +653 -0
- data/spec/data/sample.yaml +652 -0
- data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
- data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
- data/spec/imw/files/archive_spec.rb +118 -0
- data/spec/imw/files/basicfile_spec.rb +121 -0
- data/spec/imw/files/bz2_spec.rb +32 -0
- data/spec/imw/files/compressed_file_spec.rb +96 -0
- data/spec/imw/files/compressible_spec.rb +100 -0
- data/spec/imw/files/file_spec.rb +144 -0
- data/spec/imw/files/gz_spec.rb +32 -0
- data/spec/imw/files/rar_spec.rb +33 -0
- data/spec/imw/files/tar_spec.rb +31 -0
- data/spec/imw/files/text_spec.rb +23 -0
- data/spec/imw/files/zip_spec.rb +31 -0
- data/spec/imw/files_spec.rb +38 -0
- data/spec/imw/packagers/archiver_spec.rb +125 -0
- data/spec/imw/packagers/s3_mover_spec.rb +7 -0
- data/spec/imw/parsers/line_parser_spec.rb +96 -0
- data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
- data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
- data/spec/imw/utils/extensions/find_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +38 -0
- data/spec/imw/workflow/rip/local_spec.rb +89 -0
- data/spec/imw/workflow/rip_spec.rb +27 -0
- data/spec/rcov.opts +1 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +32 -0
- data/spec/support/archive_contents_matcher.rb +94 -0
- data/spec/support/custom_matchers.rb +21 -0
- data/spec/support/directory_contents_matcher.rb +61 -0
- data/spec/support/extensions.rb +18 -0
- data/spec/support/file_contents_matcher.rb +50 -0
- data/spec/support/random.rb +210 -0
- data/spec/support/without_regard_to_order_matcher.rb +58 -0
- metadata +196 -0
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module IMW
|
4
|
+
LOG_FILE_DESTINATION = STDERR unless defined?(LOG_FILE_DESTINATION)
|
5
|
+
LOG_TIMEFORMAT = "%Y%m%d-%H:%M:%S " unless defined?(LOG_TIMEFORMAT)
|
6
|
+
|
7
|
+
class << self; attr_accessor :log end
|
8
|
+
#
# Create a Logger pointed at LOG_FILE_DESTINATION and store it in IMW.log.
#
# LOG_FILE_DESTINATION is STDERR by default; redefine it (or
# LOG_TIMEFORMAT) in your ~/.imwrc, or set IMW.log yourself, if that's
# not cool.
#
# An IMW.log that is already set is kept, but its timestamp format and
# level are (re)set to LOG_TIMEFORMAT and Logger::INFO on every call.
#
def self.instantiate_logger!
  IMW.log ||= Logger.new(LOG_FILE_DESTINATION)
  # Honour the overridable constant instead of repeating its default literal.
  IMW.log.datetime_format = LOG_TIMEFORMAT
  IMW.log.level = Logger::INFO
end
|
19
|
+
|
20
|
+
#
# Write +events+, joined with newlines, to IMW.log as a single entry.
#
# A trailing options hash (as pulled off by Array#extract_options!) may
# carry :level, the Logger severity to log at; it defaults to
# Logger::INFO. The caller's hash is left untouched.
#
def announce *events
  # Merge into a fresh hash: reverse_merge! would write :level back into
  # the hash object the caller passed in.
  options = { :level => Logger::INFO }.merge(events.extract_options!)
  # puts [options, events ].inspect, "*"*76
  IMW.log.add options[:level], events.join("\n")
end
|
26
|
+
#
# Announce +events+ framed above and below by a line of 75 asterisks.
#
# Nested arrays of events are flattened and each resulting line is
# passed to +announce+ separately. A trailing options hash may carry
# :level (default Logger::INFO); the caller's hash is left untouched.
#
def banner *events
  # Build a fresh options hash rather than reverse_merge!-ing into the
  # caller's object.
  options = { :level => Logger::INFO }.merge(events.extract_options!)
  rule = "*" * 75
  [rule, events, rule].flatten.each { |ev| announce(ev, options) }
end
|
31
|
+
|
32
|
+
PROGRESS_TRACKERS = {}
|
33
|
+
#
# Announce the value of a slowly-changing tracked variable whenever it
# differs from the value last recorded for +tracker+ in
# IMW::PROGRESS_TRACKERS. The first call for a tracker always announces.
#
# Ex:
#   track_progress :indexing_names, name[0..0]  # announce at each initial letter
#   track_progress :files, (i / 1000)           # announce once per 1,000 iterations
#
# Returns +val+ when an announcement was made, nil otherwise.
#
def track_progress tracker, val
  seen = IMW::PROGRESS_TRACKERS
  return nil if seen.key?(tracker) && seen[tracker] == val
  label = tracker.to_s.gsub(/_/, ' ')
  announce "#{label}: #{val}"
  seen[tracker] = val
end
|
48
|
+
|
49
|
+
PROGRESS_COUNTERS = {}
|
50
|
+
#
# Count repetitions in a given context.
#
# Each call bumps the counter kept for +tracker+ in PROGRESS_COUNTERS,
# rounds it down to a multiple of +every+ (default 1000) and hands that
# to +track_progress+ under the name "count_of_<tracker>", so the log
# sees a line on the first call and then once per +every+ calls.
#
def track_count tracker, every=1000
  count = (PROGRESS_COUNTERS[tracker] || 0) + 1
  PROGRESS_COUNTERS[tracker] = count
  chunk = every * (count / every).to_i
  track_progress "count_of_#{tracker}", chunk
end
|
62
|
+
end
|
63
|
+
|
64
|
+
#
|
65
|
+
# Make the default logger
|
66
|
+
#
|
67
|
+
IMW.instantiate_logger!
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module IMW
|
2
|
+
# Format the current time, converted to UTC, with
# IMW::STRFTIME_FORMAT and return the resulting string.
def self.current_utc_time_string
  utc_now = Time.now.getutc
  utc_now.strftime(IMW::STRFTIME_FORMAT)
end
|
7
|
+
|
8
|
+
|
9
|
+
# A simple counter. The +value+ and +add+ methods read and
# increment the counter's value.
#
#   counter = IMW::Counter.new
#   counter.value #=> 0
#   counter.add 1
#   counter.value #=> 1
#
# The +next!+ method acts like C's <tt>value++</tt>, incrementing
# +value+ _after_ it is referenced.
#
#   counter = IMW::Counter.new
#   counter.value #=> 0
#   counter.next! #=> 0
#   counter.value #=> 1
#
# Counters can also be reset
#
#   counter.reset!
#   counter.value #=> 0
class Counter

  attr_accessor :value, :starting_value, :increment

  # Build a Counter that starts at +starting_value+ (0 unless given)
  # and steps by +increment+ (1 unless given).
  def initialize starting_value=0,increment=1
    @starting_value, @value, @increment = starting_value, starting_value, increment
  end

  # Advance the counter by +amount+, or by <tt>@increment</tt> when no
  # amount is given, and return the new value.
  def add amount=nil
    step = amount || @increment
    @value = @value + step
  end
  alias_method :add!, :add

  # Advance the counter by <tt>@increment</tt> and return the value it
  # held _before_ the step.
  def next!
    previous, @value = @value, @value + @increment
    previous
  end

  # Set the counter back to +value+, or to <tt>@starting_value</tt>
  # when no value is given.
  def reset! value=nil
    @value = value ? value : @starting_value
  end
end
|
61
|
+
end
|
62
|
+
|
63
|
+
# puts "#{File.basename(__FILE__)}: Your Monkeywrench seems suddenly more utilisable." # at bottom
|
@@ -0,0 +1,115 @@
|
|
1
|
+
#
|
2
|
+
# h2. lib/imw/utils/paths.rb -- defines the path structure of IMW
|
3
|
+
#
|
4
|
+
# == About
|
5
|
+
#
|
6
|
+
# IMW uses lots of different directories to keep information on data
|
7
|
+
# and datasets separate. This module interfaces with the
|
8
|
+
# configuration files to establish the paths to these IMW directories
|
9
|
+
# and provides functions and mixins for IMW objects to use to access
|
10
|
+
# these paths.
|
11
|
+
#
|
12
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
13
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
14
|
+
# License:: GPL 3.0
|
15
|
+
# Website:: http://infinitemonkeywrench.org/
|
16
|
+
#
|
17
|
+
|
18
|
+
module IMW
|
19
|
+
|
20
|
+
# Implements methods designed to work with an object's
# <tt>@paths</tt> attribute, adding and deleting symbolic
# references to paths and expanding calls to +path_to+ from that
# attribute or (when a miss) from <tt>IMW::PATHS</tt>.
#
# An including class should therefore define a hash attribute
# <tt>@paths</tt> mapping symbols to path segments.
module Paths

  # Expands a shorthand workflow path specification to an
  # actual file path.
  #
  #   add_path :mlb_08, 'gd2.mlb.com/components/game/mlb/year_2008'
  #   path_to :ripd, :mlb_08, 'month_06', 'day_08', 'miniscoreboard.xml'
  #   => (...)/data/ripd/gd2.mlb.com/components/game/mlb/year_2008/month_06/day_08/miniscoreboard.xml
  #
  # Absolute results are passed through File.expand_path; relative
  # results are returned as joined. Any StandardError raised while
  # expanding is re-raised as a RuntimeError naming the segments.
  def path_to *pathsegs
    path = Pathname.new path_to_helper(*pathsegs)
    path.absolute? ? File.expand_path(path) : path.to_s
  rescue StandardError => e
    # StandardError, not Exception: interrupts, exits and the like must
    # propagate instead of being turned into a "can't find path" error.
    raise("Can't find path to '#{pathsegs}': #{e}")
  end

  private
  # +path_to_helper+ handles the recursive calls for +path_to+: nil
  # segments are dropped, nested arrays flattened, and each Symbol is
  # looked up first in <tt>@paths</tt>, then in <tt>IMW::PATHS</tt>.
  # Raises IMW::PathError for a Symbol found in neither.
  def path_to_helper *pathsegs # :nodoc:
    expanded = pathsegs.flatten.compact.map do |pathseg|
      case
      when pathseg.is_a?(Symbol) && @paths.include?(pathseg)      then path_to(@paths[pathseg])
      when pathseg.is_a?(Symbol) && IMW::PATHS.include?(pathseg) then path_to(IMW::PATHS[pathseg])
      when pathseg.is_a?(Symbol) then raise IMW::PathError.new("No path expansion set for #{pathseg.inspect}")
      else pathseg
      end
    end
    File.join(*expanded)
  end
  public

  # Adds a symbolic path for expansion by +path_to+.
  def add_path sym, *pathsegs
    @paths[sym] = pathsegs.flatten
  end

  # Removes a symbolic path for expansion by +path_to+.
  def remove_path sym
    @paths.delete sym if @paths.include? sym
  end
end
|
69
|
+
|
70
|
+
# Gives every Dataset a per-instance table of symbolic paths, readable
# through +paths+ and managed with the IMW::Paths mixin.
class Dataset
  include IMW::Paths
  attr_reader :paths

  private

  # Start with an empty path table and register :self.
  # NOTE(review): eval('__FILE__') is evaluated in the eval's own
  # context, so :self may not resolve to this source file's directory --
  # confirm what :self is meant to point at.
  def set_paths
    @paths = {}
    add_path(:self, File.dirname(eval('__FILE__')))
  end
end
|
80
|
+
|
81
|
+
# Expands a shorthand path specification to an actual file path using
# only the global <tt>IMW::PATHS</tt> table (via IMW.path_to_helper).
#
# Absolute results are passed through File.expand_path; relative
# results are returned as joined. Any StandardError raised while
# expanding is re-raised as a RuntimeError naming the segments.
def self.path_to *pathsegs
  path = Pathname.new IMW.path_to_helper(*pathsegs)
  path.absolute? ? File.expand_path(path) : path.to_s
rescue StandardError => e
  # StandardError, not Exception: interrupts, exits and the like must
  # propagate instead of being turned into a "can't find path" error.
  raise("Can't find path to '#{pathsegs}': #{e}")
end
|
89
|
+
|
90
|
+
private
|
91
|
+
# +path_to_helper+ handles the recursive calls for +path_to+: nil
# segments are dropped, nested arrays flattened, Symbols are expanded
# through <tt>IMW::PATHS</tt>, and the pieces are joined with File.join.
# Raises IMW::PathError for a Symbol with no entry in IMW::PATHS.
def self.path_to_helper *pathsegs # :nodoc:
  segments = pathsegs.flatten.compact.map do |seg|
    next seg unless seg.is_a?(Symbol)
    unless IMW::PATHS.include?(seg)
      raise IMW::PathError.new("No path expansion set for #{seg.inspect}")
    end
    path_to(IMW::PATHS[seg])
  end
  File.join(*segments)
end
|
102
|
+
public
|
103
|
+
|
104
|
+
# Registers +sym+ in <tt>IMW::PATHS</tt> as shorthand for the given
# (flattened) path segments, for later expansion by +path_to+.
# Returns the stored array of segments.
def self.add_path sym, *pathsegs
  IMW::PATHS.store(sym, pathsegs.flatten)
end
|
108
|
+
|
109
|
+
# Drops +sym+ from <tt>IMW::PATHS</tt> so +path_to+ no longer expands it.
# Returns the segments that were registered, or nil if +sym+ was unknown.
def self.remove_path sym
  registry = IMW::PATHS
  registry.key?(sym) ? registry.delete(sym) : nil
end
|
113
|
+
end
|
114
|
+
|
115
|
+
# puts "#{File.basename(__FILE__)}: Your monkeywrench glows alternately dim then bright as you wander, suggesting to you which paths to take."
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'imw/utils'
|
2
|
+
require 'imw/utils/uuid'
|
3
|
+
require 'addressable/uri'
|
4
|
+
module Addressable
  #
  # Add the #host_valid?, #path_valid?, #simple?, #revhost and
  # #url_uuid calls
  #
  class URI
    SAFE_CHARS     = %r{a-zA-Z0-9\-\._!\(\)\*\'}
    PATH_CHARS     = %r{#{SAFE_CHARS}\$&\+,:=@\/;}
    RESERVED_CHARS = %r{\$&\+,:=@\/;\?\%}
    UNSAFE_CHARS   = %r{\\ \"\#<>\[\]\^\`\|\~\{\}}
    HOST_HEAD      = '(?:[a-z0-9\-]+\.)+'
    HOST_TLD       = '(?:[a-z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum)'

    # Body of the character class used by #path_valid?, built from the
    # raw source text. Interpolating the PATH_CHARS Regexp itself into
    # [...] would drag its "(?-mix:" wrapper into the class, where "?-m"
    # is read as a range that lets through [ ] \ ^ and backtick.
    PATH_CHAR_CLASS = SAFE_CHARS.source + '\$&\+,:=@\/;%'
    PATH_VALID_RE   = %r{\A[#{PATH_CHAR_CLASS}]*\z}

    # True when +host+ is one or more dot-terminated labels followed by
    # a two-letter or listed TLD (case-insensitive).
    def host_valid?
      !!(host =~ %r{\A#{HOST_HEAD}#{HOST_TLD}\z}i)
    end

    # True when +path+ consists only of the safe characters, the path
    # punctuation $&+,:=@/; and % (an empty path is accepted).
    def path_valid?
      !!(path =~ PATH_VALID_RE)
    end

    # True when the scheme is http or unset, the port is 80 or unset,
    # and neither a user nor a password is given.
    def simple_connection_part?
      ( ['http', nil].include?(scheme) &&
        [80,     nil].include?(port) &&
        (self.to_hash.values_at(:password, :user).join.blank?) )
    end

    #
    # Does this look like a simple URL: a valid host and path, reached
    # over plain http on the default port with no credentials?
    #
    def simple?
      host_valid? && path_valid? && simple_connection_part?
    end

    #
    # +revhost+
    # the dot-reversed host:
    #   foo.company.com => com.company.foo
    #
    # A host without a dot (or a nil host) is returned unchanged.
    #
    def revhost
      return host unless host =~ /\./
      host.split('.').reverse.join('.')
    end

    #
    # +url_uuid+ -- RFC-4122 ver.5 (SHA-1, name-based) uuid computed
    # from the normalized form of this URI in the URL namespace.
    #
    # See
    #   http://www.faqs.org/rfcs/rfc4122.html
    #
    def url_uuid
      UUID.sha1_create(UUID_URL_NAMESPACE, self.normalize.to_s)
    end
  end
end
|
55
|
+
|
56
|
+
# Provide +encode_segment+ / +unencode_segment+ as class-method aliases
# of +encode_component+ / +unencode_component+ on Addressable::URI, but
# only where the class does not already define them.
class << Addressable::URI
  # method_defined? asks the singleton class about URI's class methods;
  # `defined?(encode_segment)` was evaluated against the singleton class
  # object itself and so never saw them.
  alias_method :encode_segment,   :encode_component   unless method_defined?(:encode_segment)
  alias_method :unencode_segment, :unencode_component unless method_defined?(:unencode_segment)
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'uuidtools'
|
2
|
+
|
3
|
+
class UUID

  #
  # A string suitable for using as a path name --
  #
  # Ex.
  #    urn:uuid:3c0dce44-80a8-11dd-a897-001ff35a0a8b =>
  #    urn_uuid/3c0dce44/80a8/11dd/a897/001ff35a0a8b
  #
  # Every ':' and '-' in +to_s+ becomes a '/'. It's well possible there
  # are more perspicacious choices for points to split the string, but
  # until we hit that limit this'll do.
  #
  def to_path
    'urn_uuid/' + to_s.gsub(/[\:\-]/,'/')
  end

  # Insert the dashes into a run of 32 hex digits (either case),
  # giving the 8-4-4-4-12 grouping.
  #
  # Raises ArgumentError when +str+ contains no such run, instead of
  # failing with a NoMethodError on nil.
  def self.hex_to_str str
    match = /([\da-f]{8})([\da-f]{4})([\da-f]{4})([\da-f]{4})([\da-f]{12})/i.match(str)
    raise ArgumentError, "not a 32-digit hex UUID: #{str.inspect}" unless match
    match.captures.join '-'
  end


  # Parse a UUID given as 32 undashed hex digits.
  def self.parse_hex str
    parse(UUID.hex_to_str(str))
  end

  # Overrides UUIDTools -- force 32 hex digits (leading zeros)
  def hexdigest
    "%032x" % self.to_i
  end

end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
# Return true if <tt>email</tt> is a valid email address: exactly one
# '@', a non-empty local part made only of letters, digits and
# <tt>_ ~ = + - .</tt> with no leading, trailing or doubled dot, and a
# domain part accepted by +is_domain?+.
#
# Raises ArgumentError if <tt>email</tt> is not a String.
def is_email?(email)
  raise ArgumentError, "'email' must be a string" unless email.is_a?(String)
  return false if email.empty?

  # limit -1 keeps trailing empty fields, so "a@b.com@" counts 3 parts
  parts = email.split('@', -1)
  return false if parts.size != 2

  local = parts.first
  # Anchored, with '+' so an empty local part fails and '-' escaped so
  # that "+-." is not read as a character range.
  return false unless local =~ /\A[a-zA-Z0-9_~=+.\-]+\z/ # allowed characters
  return false if local[0,1]  == '.'                     # starts with .
  return false if local[-1,1] == '.'                     # end with .
  return false if local.include?('..')                   # can't repeat .

  return false if not is_domain?(parts.last)

  return true
end
|
22
|
+
|
23
|
+
# Return true if <tt>domain</tt> is a valid domain name: at most 255
# characters drawn from letters, digits, '.' and '-', split by dots
# into at most 127 labels, each 1 to 63 characters long. A single
# trailing dot is tolerated.
#
# Raises ArgumentError if <tt>domain</tt> is not a String.
def is_domain?(domain)
  raise ArgumentError, "'domain' must be a string" unless domain.is_a?(String)
  return false if domain.empty?

  return false if domain.size > 255                     # max number of characters in a domain
  # \A..\z anchor the whole string; ^..$ would accept "good.com\n<anything>"
  return false unless domain =~ /\A[a-zA-Z0-9.\-]+\z/   # allowed characters
  # limit -1 keeps empty fields so "a..b" and "..." expose empty labels
  labels = domain.chomp('.').split('.', -1)
  return false if labels.empty?
  return false if labels.size > 127                     # max number of subdomains
  return false if labels.any? { |label| label.empty? || label.size > 63 } # label length

  return true
end
|
36
|
+
|
37
|
+
|
38
|
+
# puts "#{File.basename(__FILE__)}: As you shape your body to the confines of your container you feel a tremendous sense of validation." # at bottom
|
@@ -0,0 +1,113 @@
|
|
1
|
+
|
2
|
+
class ActiveRecord::Base
|
3
|
+
class << self
|
4
|
+
end
|
5
|
+
# def merge!(hsh)
|
6
|
+
# hsh = hsh.dup
|
7
|
+
# # puts hsh.to_yaml
|
8
|
+
# # has_many datasets, notes, fields, contributors
|
9
|
+
# self.class.reflect_on_all_associations.each do |ass|
|
10
|
+
# # ["@macro", "@class_name", "@name", "@primary_key_name", "@options",
|
11
|
+
# # "@klass",
|
12
|
+
# # "@through_reflection",
|
13
|
+
# # "@active_record",
|
14
|
+
# puts [ass.name, ass.macro, ass.primary_key_name].to_yaml
|
15
|
+
# if ass.macro == :has_many
|
16
|
+
# els = hsh.delete(ass.name.to_s) || []
|
17
|
+
# puts "!!!!!!!!!!!!!!!!!!!!!!!!!!", els, '!!'
|
18
|
+
# els.each do |el|
|
19
|
+
# puts el
|
20
|
+
# self[ass.name] = ass.klass.new().merge!(el)
|
21
|
+
# end
|
22
|
+
# end
|
23
|
+
# hsh.each do |key,val|
|
24
|
+
# self[key] = val
|
25
|
+
# end
|
26
|
+
# p self
|
27
|
+
# p self.datasets if self.respond_to? 'datasets'
|
28
|
+
# end
|
29
|
+
# end
|
30
|
+
# Load the key/value pairs of +hsh+ into this record through
# <tt>self[key] = value</tt>, save it with <tt>save!</tt> and return
# self. The incoming hash is echoed to stdout as JSON first.
def undump(hsh)
  puts "undumping from #{hsh.to_json}"
  hsh.each{ |k,v| self[k] = v }
  self.save!
  self
end
|
36
|
+
end
|
37
|
+
|
38
|
+
class Pool < ActiveRecord::Base
  # Rebuild a Pool from a dumped hash.
  #
  # The "datasets", "fields", "contributors" and "pool_notes" entries
  # are removed from +hsh+ (the caller's hash is modified); each element
  # is undumped into a new record of the matching class, saved, and the
  # resulting list assigned to this record under the same key. What is
  # left of +hsh+ goes to the inherited +undump+. Returns self.
  def undump(hsh)
    children = { :datasets     => Dataset,     :fields     => Field,
                 :contributors => Contributor, :pool_notes => PoolNote }
    children.each do |field, klass|
      key  = field.to_s
      vals = hsh.delete(key) || []
      puts "Undumping #{vals} info #{field}"
      self[key] = vals.map do |val|
        record = klass.new().undump(val)
        record.save!
        record
      end
    end
    super
    self
  end
end
|
50
|
+
|
51
|
+
class Dataset < ActiveRecord::Base
  # Rebuild a Dataset from a dumped hash.
  #
  # The "datasets", "fields", "contributors" and "dataset_notes" entries
  # are removed from +hsh+ (the caller's hash is modified); each element
  # is undumped into a new record of the matching class, saved, and the
  # resulting list assigned to this record under the same key. What is
  # left of +hsh+ goes to the inherited +undump+, after which the record
  # is printed as YAML. Returns self.
  def undump(hsh)
    children = { :datasets     => Dataset,     :fields        => Field,
                 :contributors => Contributor, :dataset_notes => DatasetNote }
    children.each do |field, klass|
      key  = field.to_s
      vals = hsh.delete(key) || []
      puts "Undumping #{vals} info #{field}"
      self[key] = vals.map do |val|
        record = klass.new().undump(val)
        record.save!
        record
      end
    end
    super
    puts "Got Dataset #{self.to_yaml}"
    self
  end
end
|
64
|
+
|
65
|
+
class IMW < OpenStruct
|
66
|
+
|
67
|
+
#
|
68
|
+
# Takes an Infochimps Stupid Schema stream and
|
69
|
+
# constructs the corresponding objects.
|
70
|
+
#
|
71
|
+
# Here are the rules:
|
72
|
+
# * the schema has the structure
|
73
|
+
# # this has to be first.
|
74
|
+
# - infochimps_schema:
|
75
|
+
# schema_version: 0.2 # in case stuff changes
|
76
|
+
# # then any number of imw objects:
|
77
|
+
# - pool: (...)
|
78
|
+
# fields: [era, innings_pitched, ...]
|
79
|
+
# - dataset: (...)
|
80
|
+
# fields:
|
81
|
+
# - name: Earned Run Average
|
82
|
+
# handle: era
|
83
|
+
# concept: baseball-era
|
84
|
+
# units: earned_runs / (9*innings_pitched)
|
85
|
+
# - contributor: (...)
|
86
|
+
# - field: (...)
|
87
|
+
#
|
88
|
+
# * Objects are referred to by __handle__, *NOT* __id__. If an ID is
|
89
|
+
# included, and an object exists with a non-matching ID or handle,
|
90
|
+
# an error will be raised.
|
91
|
+
#
|
92
|
+
# * We want to make the schema files maintainable by hand, which means that
|
93
|
+
# the loader tries to be smart about inline-defined objects. That is, you
|
94
|
+
# can either refer to (via handle) a field defined elsewhere, or you can
|
95
|
+
# define the field in whole, and trust that the Right Thing will
|
96
|
+
# happen. This presents the problem of collisions, though. If a bulk object
|
97
|
+
# update arrives, we need to know whom to believe -- bulk loader or
|
98
|
+
# database. In the absence of versioning: we look up the object by its
|
99
|
+
# handle. If there's an existing object, any new information (fields with
|
100
|
+
# values in new that are blank in old) is added to it. If the object is
|
101
|
+
# defined at the top level, it wins; if the object is defined as a sub field
|
102
|
+
# it loses.
|
103
|
+
#
|
104
|
+
# * Every interesting object (Pool, Dataset, Contributor, Field) has a desc:
|
105
|
+
# attribute (for Pool and Dataset it's virtual but never mind) to describe
|
106
|
+
# __itself__. Additionally, every interesting relationship has its own desc: field.
|
107
|
+
#
|
108
|
+
|
109
|
+
# Placeholder for the schema loader: the body holds only a note
# and the method returns nil whatever +schema+ is.
def self.undump(schema)
  # compact then merge -- kill off blank
end
|
113
|
+
end
|