imw 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +15 -0
- data/CHANGELOG +0 -0
- data/LICENSE +674 -0
- data/README.rdoc +101 -0
- data/Rakefile +20 -0
- data/VERSION +1 -0
- data/etc/imwrc.rb +76 -0
- data/lib/imw.rb +42 -0
- data/lib/imw/boot.rb +58 -0
- data/lib/imw/dataset.rb +233 -0
- data/lib/imw/dataset/datamapper.rb +66 -0
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
- data/lib/imw/dataset/loaddump.rb +50 -0
- data/lib/imw/dataset/old/file_collection.rb +88 -0
- data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
- data/lib/imw/dataset/scaffold.rb +132 -0
- data/lib/imw/dataset/scraped_uri.rb +305 -0
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
- data/lib/imw/dataset/scrub/scrub.rb +147 -0
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
- data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
- data/lib/imw/dataset/scrub/slug.rb +101 -0
- data/lib/imw/dataset/stats.rb +73 -0
- data/lib/imw/dataset/stats/counter.rb +23 -0
- data/lib/imw/dataset/task.rb +38 -0
- data/lib/imw/dataset/workflow.rb +81 -0
- data/lib/imw/files.rb +110 -0
- data/lib/imw/files/archive.rb +113 -0
- data/lib/imw/files/basicfile.rb +122 -0
- data/lib/imw/files/binary.rb +28 -0
- data/lib/imw/files/compressed_file.rb +93 -0
- data/lib/imw/files/compressed_files_and_archives.rb +348 -0
- data/lib/imw/files/compressible.rb +103 -0
- data/lib/imw/files/csv.rb +112 -0
- data/lib/imw/files/json.rb +41 -0
- data/lib/imw/files/sgml.rb +65 -0
- data/lib/imw/files/text.rb +68 -0
- data/lib/imw/files/yaml.rb +46 -0
- data/lib/imw/packagers.rb +8 -0
- data/lib/imw/packagers/archiver.rb +108 -0
- data/lib/imw/packagers/s3_mover.rb +28 -0
- data/lib/imw/parsers.rb +7 -0
- data/lib/imw/parsers/html_parser.rb +382 -0
- data/lib/imw/parsers/html_parser/matchers.rb +306 -0
- data/lib/imw/parsers/line_parser.rb +87 -0
- data/lib/imw/parsers/regexp_parser.rb +72 -0
- data/lib/imw/utils.rb +24 -0
- data/lib/imw/utils/components.rb +61 -0
- data/lib/imw/utils/config.rb +46 -0
- data/lib/imw/utils/error.rb +54 -0
- data/lib/imw/utils/extensions/array.rb +125 -0
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
- data/lib/imw/utils/extensions/core.rb +43 -0
- data/lib/imw/utils/extensions/dir.rb +24 -0
- data/lib/imw/utils/extensions/file_core.rb +64 -0
- data/lib/imw/utils/extensions/hash.rb +218 -0
- data/lib/imw/utils/extensions/hpricot.rb +48 -0
- data/lib/imw/utils/extensions/string.rb +49 -0
- data/lib/imw/utils/extensions/struct.rb +42 -0
- data/lib/imw/utils/extensions/symbol.rb +28 -0
- data/lib/imw/utils/extensions/typed_struct.rb +22 -0
- data/lib/imw/utils/extensions/uri.rb +59 -0
- data/lib/imw/utils/log.rb +67 -0
- data/lib/imw/utils/misc.rb +63 -0
- data/lib/imw/utils/paths.rb +115 -0
- data/lib/imw/utils/uri.rb +59 -0
- data/lib/imw/utils/uuid.rb +33 -0
- data/lib/imw/utils/validate.rb +38 -0
- data/lib/imw/utils/version.rb +12 -0
- data/lib/imw/utils/view.rb +113 -0
- data/lib/imw/utils/view/dump_csv.rb +112 -0
- data/lib/imw/utils/view/dump_csv_older.rb +117 -0
- data/spec/data/sample.csv +131 -0
- data/spec/data/sample.tsv +131 -0
- data/spec/data/sample.txt +131 -0
- data/spec/data/sample.xml +653 -0
- data/spec/data/sample.yaml +652 -0
- data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
- data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
- data/spec/imw/files/archive_spec.rb +118 -0
- data/spec/imw/files/basicfile_spec.rb +121 -0
- data/spec/imw/files/bz2_spec.rb +32 -0
- data/spec/imw/files/compressed_file_spec.rb +96 -0
- data/spec/imw/files/compressible_spec.rb +100 -0
- data/spec/imw/files/file_spec.rb +144 -0
- data/spec/imw/files/gz_spec.rb +32 -0
- data/spec/imw/files/rar_spec.rb +33 -0
- data/spec/imw/files/tar_spec.rb +31 -0
- data/spec/imw/files/text_spec.rb +23 -0
- data/spec/imw/files/zip_spec.rb +31 -0
- data/spec/imw/files_spec.rb +38 -0
- data/spec/imw/packagers/archiver_spec.rb +125 -0
- data/spec/imw/packagers/s3_mover_spec.rb +7 -0
- data/spec/imw/parsers/line_parser_spec.rb +96 -0
- data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
- data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
- data/spec/imw/utils/extensions/find_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +38 -0
- data/spec/imw/workflow/rip/local_spec.rb +89 -0
- data/spec/imw/workflow/rip_spec.rb +27 -0
- data/spec/rcov.opts +1 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +32 -0
- data/spec/support/archive_contents_matcher.rb +94 -0
- data/spec/support/custom_matchers.rb +21 -0
- data/spec/support/directory_contents_matcher.rb +61 -0
- data/spec/support/extensions.rb +18 -0
- data/spec/support/file_contents_matcher.rb +50 -0
- data/spec/support/random.rb +210 -0
- data/spec/support/without_regard_to_order_matcher.rb +58 -0
- metadata +196 -0
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
require 'logger'
|
|
2
|
+
|
|
3
|
+
module IMW
  LOG_FILE_DESTINATION = STDERR unless defined?(LOG_FILE_DESTINATION)
  LOG_TIMEFORMAT = "%Y%m%d-%H:%M:%S " unless defined?(LOG_TIMEFORMAT)

  class << self; attr_accessor :log end

  #
  # Create a Logger and point it at LOG_FILE_DESTINATION
  #
  # LOG_FILE_DESTINATION is STDERR by default; redefine it (and
  # LOG_TIMEFORMAT) before this file is loaded, or set IMW.log
  # yourself, if that's not cool.  An IMW.log that is already set is
  # kept; only its time format and level are (re)applied.
  #
  def self.instantiate_logger!
    IMW.log ||= Logger.new(LOG_FILE_DESTINATION)
    # Use the configurable constant rather than a second copy of the format.
    IMW.log.datetime_format = LOG_TIMEFORMAT
    IMW.log.level = Logger::INFO
  end

  #
  # Log +events+ (joined with newlines) as a single message on IMW.log.
  #
  # A trailing Hash is treated as options; <tt>:level</tt> selects the
  # Logger severity and defaults to Logger::INFO.
  #
  def announce(*events)
    # Plain-Ruby option extraction: no dependency on Array#extract_options!
    # or Hash#reverse_merge! having been monkey-patched in beforehand.
    options = events.last.is_a?(Hash) ? events.pop : {}
    options = { :level => Logger::INFO }.merge(options)
    IMW.log.add options[:level], events.join("\n")
  end

  #
  # Announce each of +events+ on its own line, framed above and below
  # by a rule of 75 asterisks.  Takes the same trailing options Hash as
  # +announce+.
  #
  def banner(*events)
    options = events.last.is_a?(Hash) ? events.pop : {}
    options = { :level => Logger::INFO }.merge(options)
    rule = "*" * 75
    [rule, events, rule].flatten.each { |ev| announce(ev, options) }
  end

  PROGRESS_TRACKERS = {}
  #
  # When the slowly-changing tracked variable +val+ changes value,
  # announce its new value.  Always announces on first call.
  #
  # Ex:
  #   track_progress :indexing_names, name[0..0]  # announce at each initial letter
  #   track_progress :files, (i / 1000)           # announce at each 1,000 iterations
  #
  # Returns +val+ when an announcement was made, nil otherwise.
  #
  def track_progress(tracker, val)
    return if IMW::PROGRESS_TRACKERS.include?(tracker) &&
              IMW::PROGRESS_TRACKERS[tracker] == val
    announce "#{tracker.to_s.gsub(/_/, ' ')}: #{val}"
    IMW::PROGRESS_TRACKERS[tracker] = val
  end

  PROGRESS_COUNTERS = {}
  #
  # Log repetitions in a given context
  #
  # At every n'th (default 1000) call,
  # announce progress in the IMW.log
  #
  def track_count(tracker, every = 1000)
    PROGRESS_COUNTERS[tracker] ||= 0
    PROGRESS_COUNTERS[tracker] += 1
    # Round the running count down to a multiple of +every+; track_progress
    # then only speaks up when that rounded value changes.
    chunk = every * (PROGRESS_COUNTERS[tracker] / every).to_i
    track_progress "count_of_#{tracker}", chunk
  end
end

#
# Make the default logger
#
IMW.instantiate_logger!
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
module IMW
  # The current time, converted to UTC and rendered with
  # IMW::STRFTIME_FORMAT.
  def self.current_utc_time_string
    now_utc = Time.now.utc
    now_utc.strftime(IMW::STRFTIME_FORMAT)
  end


  # A minimal running tally.  Read it with +value+, bump it with +add+.
  #
  #   counter = IMW::Counter.new
  #   counter.value  #=> 0
  #   counter.add 1
  #   counter.value  #=> 1
  #
  # +next!+ behaves like C's <tt>value++</tt>: it hands back the
  # current +value+ and only then increments it.
  #
  #   counter = IMW::Counter.new
  #   counter.value  #=> 0
  #   counter.next!  #=> 0
  #   counter.value  #=> 1
  #
  # A counter can be put back to its starting point:
  #
  #   counter.reset!
  #   counter.value  #=> 0
  class Counter

    attr_accessor :value, :starting_value, :increment

    # Build a Counter that starts at +starting_value+ (default 0) and
    # steps by +increment+ (default 1).
    def initialize(starting_value = 0, increment = 1)
      @starting_value, @increment = starting_value, increment
      @value = starting_value
    end

    # Advance the counter by +amount+, or by <tt>@increment</tt> when
    # no amount is given.  Returns the new value.
    def add(amount = nil)
      step = amount || @increment
      @value += step
    end
    alias add! add

    # Post-increment: step by <tt>@increment</tt> and return the value
    # the counter held _before_ the step.
    def next!
      current = @value
      add
      current
    end

    # Set the counter to +value+, or back to <tt>@starting_value</tt>
    # when no value is given.  Returns the value it was set to.
    def reset!(value = nil)
      @value = value ? value : @starting_value
    end
  end
end
|
|
62
|
+
|
|
63
|
+
# puts "#{File.basename(__FILE__)}: Your Monkeywrench seems suddenly more utilisable." # at bottom
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
#
|
|
2
|
+
# h2. lib/imw/utils/paths.rb -- defines the path structure of IMW
|
|
3
|
+
#
|
|
4
|
+
# == About
|
|
5
|
+
#
|
|
6
|
+
# IMW uses lots of different directories to keep information on data
|
|
7
|
+
# and datasets separate. This module interfaces with the
|
|
8
|
+
# configuration files to establish the paths to these IMW directories
|
|
9
|
+
# and provides functions and mixins for IMW objects to use to access
|
|
10
|
+
# these paths.
|
|
11
|
+
#
|
|
12
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
13
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
14
|
+
# License:: GPL 3.0
|
|
15
|
+
# Website:: http://infinitemonkeywrench.org/
|
|
16
|
+
#
|
|
17
|
+
|
|
18
|
+
require 'pathname' # Pathname is used below; don't rely on someone else loading it

module IMW

  # Implements methods designed to work with an object's
  # <tt>@paths</tt> attribute, adding and deleting symbolic
  # references to paths and expanding calls to +path_to+ from that
  # attribute or (when a miss) from <tt>IMW::PATHS</tt>.
  #
  # An including class should therefore define a Hash attribute
  # <tt>@paths</tt> mapping each Symbol to its path segments.
  module Paths

    # Expands a shorthand workflow path specification to an
    # actual file path.
    #
    #   add_path :mlb_08, 'gd2.mlb.com/components/game/mlb/year_2008'
    #   path_to :ripd, :mlb_08, 'month_06', 'day_08', 'miniscoreboard.xml'
    #   => (...)/data/ripd/gd2.mlb.com/components/game/mlb/year_2008/month_06/day_08/miniscoreboard.xml
    #
    # Absolute results are normalised with File.expand_path; relative
    # results are returned as joined.  Any StandardError raised while
    # expanding (including IMW::PathError for an unknown symbol) is
    # re-raised as a RuntimeError naming the requested segments.
    def path_to(*pathsegs)
      path = Pathname.new path_to_helper(*pathsegs)
      path.absolute? ? File.expand_path(path) : path.to_s
    rescue StandardError => e
      # StandardError, not Exception: Interrupt/SystemExit must pass through.
      raise "Can't find path to '#{pathsegs}': #{e}"
    end

    private

    # +path_to_helper+ handles the recursive calls for +path_to+:
    # Symbols are looked up in <tt>@paths</tt> first, then in
    # <tt>IMW::PATHS</tt>; everything else is used verbatim.
    def path_to_helper(*pathsegs) # :nodoc:
      expanded = pathsegs.flatten.compact.map do |pathseg|
        case
        when pathseg.is_a?(Symbol) && @paths.include?(pathseg)      then path_to(@paths[pathseg])
        when pathseg.is_a?(Symbol) && IMW::PATHS.include?(pathseg) then path_to(IMW::PATHS[pathseg])
        when pathseg.is_a?(Symbol) then raise IMW::PathError, "No path expansion set for #{pathseg.inspect}"
        else pathseg
        end
      end
      File.join(*expanded)
    end

    public

    # Adds a symbolic path for expansion by +path_to+.
    def add_path(sym, *pathsegs)
      @paths[sym] = pathsegs.flatten
    end

    # Removes a symbolic path for expansion by +path_to+.
    # Returns the removed segments, or nil if +sym+ was not registered.
    def remove_path(sym)
      @paths.delete sym if @paths.include? sym
    end
  end

  class Dataset
    attr_reader :paths
    include IMW::Paths

    private

    # Initialise <tt>@paths</tt> with a single <tt>:self</tt> entry.
    def set_paths
      @paths = {}
      # NOTE(review): eval('__FILE__') does not yield this file's name (it
      # evaluates inside an eval context), so :self is probably not the
      # directory that was intended -- confirm before relying on it.
      add_path :self, File.dirname(eval('__FILE__'))
    end
  end

  # Module-level counterpart of Paths#path_to, expanding Symbols from
  # <tt>IMW::PATHS</tt> only.  Same return and error behaviour.
  def self.path_to(*pathsegs)
    path = Pathname.new IMW.path_to_helper(*pathsegs)
    path.absolute? ? File.expand_path(path) : path.to_s
  rescue StandardError => e
    raise "Can't find path to '#{pathsegs}': #{e}"
  end

  # +path_to_helper+ handles the recursive calls for +path_to+.
  #
  # (A bare +private+ has no effect on <tt>def self.</tt> methods, so
  # this stays publicly callable, exactly as before.)
  def self.path_to_helper(*pathsegs) # :nodoc:
    expanded = pathsegs.flatten.compact.map do |pathseg|
      case
      when pathseg.is_a?(Symbol) && IMW::PATHS.include?(pathseg) then path_to(IMW::PATHS[pathseg])
      when pathseg.is_a?(Symbol) then raise IMW::PathError, "No path expansion set for #{pathseg.inspect}"
      else pathseg
      end
    end
    File.join(*expanded)
  end

  # Adds a symbolic path for expansion by +path_to+.
  def self.add_path(sym, *pathsegs)
    IMW::PATHS[sym] = pathsegs.flatten
  end

  # Removes a symbolic path for expansion by +path_to+.
  def self.remove_path(sym)
    IMW::PATHS.delete sym if IMW::PATHS.include? sym
  end
end
|
|
114
|
+
|
|
115
|
+
# puts "#{File.basename(__FILE__)}: Your monkeywrench glows alternately dim then bright as you wander, suggesting to you which paths to take."
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
require 'imw/utils'
|
|
2
|
+
require 'imw/utils/uuid'
|
|
3
|
+
require 'addressable/uri'
|
|
4
|
+
module Addressable
  #
  # Adds validity predicates, #revhost and #url_uuid
  #
  class URI
    # These four are character-class *fragments* (meant to be dropped
    # between [ and ]), stored as Regexps.  Always splice them in via
    # +source+: interpolating the Regexp object itself inserts a
    # "(?-mix:...)" wrapper whose characters then become class members.
    SAFE_CHARS     = %r{a-zA-Z0-9\-\._!\(\)\*\'}
    PATH_CHARS     = %r{#{SAFE_CHARS.source}\$&\+,:=@\/;}
    RESERVED_CHARS = %r{\$&\+,:=@\/;\?\%}
    UNSAFE_CHARS   = %r{\\ \"\#<>\[\]\^\`\|\~\{\}}
    HOST_HEAD      = '(?:[a-z0-9\-]+\.)+'
    HOST_TLD       = '(?:[a-z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum)'

    # True when +host+ is one or more dot-terminated labels followed by
    # a TLD from HOST_TLD (case-insensitive).  False for a nil host.
    def host_valid?
      !!(host =~ %r{\A#{HOST_HEAD}#{HOST_TLD}\z}i)
    end

    # True when +path+ consists solely of PATH_CHARS characters and '%'.
    def path_valid?
      !!(path =~ %r{\A[#{PATH_CHARS.source}%]*\z})
    end

    # True when the scheme is http (or absent), the port is 80 (or
    # absent) and neither user nor password is set.
    def simple_connection_part?
      ( ['http', nil].include?(scheme) &&
        [80,     nil].include?(port)   &&
        (self.to_hash.values_at(:password, :user).join.blank?) )
    end

    #
    # True when the host and path both validate and the connection part
    # is simple (see the three predicates above).
    #
    def simple?
      host_valid? && path_valid? && simple_connection_part?
    end

    #
    # +revhost+
    # the dot-reversed host:
    #   foo.company.com => com.company.foo
    #
    # A host without any dot (or a nil host) is returned unchanged.
    #
    def revhost
      return host unless host =~ /\./
      host.split('.').reverse.join('.')
    end

    #
    # +url_uuid+ -- a UUID built by UUID.sha1_create from
    # UUID_URL_NAMESPACE and this URI's normalized string form.
    # NOTE(review): presumably an RFC-4122 version 5 (SHA-1, name-based)
    # UUID, i.e. the same URL always maps to the same UUID -- confirm
    # against the uuidtools documentation.
    #
    # See
    #   http://www.faqs.org/rfcs/rfc4122.html
    #
    def url_uuid
      UUID.sha1_create(UUID_URL_NAMESPACE, self.normalize.to_s)
    end
  end
end
|
|
55
|
+
|
|
56
|
+
# Provide the +encode_segment+ / +unencode_segment+ class-method names as
# aliases of +encode_component+ / +unencode_component+ when the loaded
# Addressable does not already define them.
class << Addressable::URI
  # Inside `class << X`, `defined?(name)` asks whether the singleton class
  # object itself responds to +name+, which is never the case here, so the
  # old guard always aliased.  Ask the singleton class about its own
  # (public or private) instance methods instead.
  unless method_defined?(:encode_segment) || private_method_defined?(:encode_segment)
    alias_method :encode_segment, :encode_component
  end
  unless method_defined?(:unencode_segment) || private_method_defined?(:unencode_segment)
    alias_method :unencode_segment, :unencode_component
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
require 'uuidtools'
|
|
2
|
+
|
|
3
|
+
class UUID

  #
  # A string suitable for using as a path name: the +to_s+ form with
  # every ':' and '-' turned into '/', under a 'urn_uuid/' prefix.
  #
  # Ex.
  #    3c0dce44-80a8-11dd-a897-001ff35a0a8b =>
  #    urn_uuid/3c0dce44/80a8/11dd/a897/001ff35a0a8b
  #
  # It's well possible there are more perspicacious choices for points to split
  # the string, but until we hit that limit this'll do.
  #
  def to_path
    'urn_uuid/' + to_s.tr(':-', '/')
  end

  # Insert the canonical 8-4-4-4-12 hyphens into the first run of 32
  # hex digits found in +str+ (either letter case; case is preserved).
  #
  # Raises ArgumentError when +str+ contains no such run.
  def self.hex_to_str(str)
    match = /([\da-f]{8})([\da-f]{4})([\da-f]{4})([\da-f]{4})([\da-f]{12})/i.match(str)
    raise ArgumentError, "not a 32-digit hex UUID: #{str.inspect}" unless match
    match.captures.join '-'
  end


  # Parse a UUID given as 32 undelimited hex digits.
  def self.parse_hex(str)
    parse(UUID.hex_to_str(str))
  end

  # Overrides UUIDTools -- force 32 hex digits (leading zeros)
  def hexdigest
    "%032x" % self.to_i
  end

end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
# Return true if <tt>email</tt> is a valid email address
#
# "Valid" here means: exactly one '@'; a non-empty local part made only
# of letters, digits and <tt>_ ~ = + - .</tt>, which neither starts nor
# ends with '.' and has no '..'; and a domain accepted by +is_domain?+.
#
# Raises ArgumentError unless +email+ is a String.
def is_email?(email)
  raise ArgumentError, "'email' must be a string" unless email.is_a?(String)
  return false if email.empty?

  # limit -1 keeps trailing empty fields, so "a@b.com@" is 3 parts, not 2
  parts = email.split('@', -1)
  return false if parts.size != 2

  local, domain = parts
  # Anchored, one-or-more, and with '-' escaped so "+-." is not a range.
  return false unless local =~ /\A[a-zA-Z0-9_~=+.\-]+\z/ # allowed characters
  return false if local[0, 1] == '.'                     # starts with .
  return false if local[-1, 1] == '.'                    # end with .
  return false if local.include?('..')                   # can't repeat .

  is_domain?(domain) ? true : false
end
|
|
22
|
+
|
|
23
|
+
# Return true if <tt>domain</tt> is a valid domain name
#
# Checks: at most 255 characters overall; only letters, digits, '.'
# and '-'; at most 127 dot-separated labels; every label between 1 and
# 63 characters.  A single trailing dot is tolerated.
#
# Raises ArgumentError unless +domain+ is a String.
def is_domain?(domain)
  raise ArgumentError, "'domain' must be a string" unless domain.is_a?(String)
  return false if domain.empty?

  return false if domain.size > 255                      # max number of characters in a domain
  # \A...\z anchor the whole string; ^...$ would accept "ok.com\n<anything>"
  return false unless domain =~ /\A[a-zA-Z0-9.\-]+\z/     # allowed characters

  # limit -1 keeps empty fields so "a..b" and ".com" can be rejected
  labels = domain.chomp('.').split('.', -1)
  return false if labels.empty?
  return false if labels.size > 127                      # max number of subdomains
  labels.all? { |label| !label.empty? && label.size <= 63 } # 1..63 characters per label
end
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# puts "#{File.basename(__FILE__)}: As you shape your body to the confines of your container you feel a tremendous sense of validation." # at bottom
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
|
|
2
|
+
class ActiveRecord::Base
  # (Disabled earlier attempt at a generic, association-aware loader;
  # kept for reference.)
  #
  # def merge!(hsh)
  #   hsh = hsh.dup
  #   # puts hsh.to_yaml
  #   # has_many datasets, notes, fields, contributors
  #   self.class.reflect_on_all_associations.each do |ass|
  #     # ["@macro", "@class_name", "@name", "@primary_key_name", "@options",
  #     #  "@klass",
  #     #  "@through_reflection",
  #     #  "@active_record",
  #     puts [ass.name, ass.macro, ass.primary_key_name].to_yaml
  #     if ass.macro == :has_many
  #       els = hsh.delete(ass.name.to_s) || []
  #       puts "!!!!!!!!!!!!!!!!!!!!!!!!!!", els, '!!'
  #       els.each do |el|
  #         puts el
  #         self[ass.name] = ass.klass.new().merge!(el)
  #       end
  #     end
  #     hsh.each do |key,val|
  #       self[key] = val
  #     end
  #     p self
  #     p self.datasets if self.respond_to? 'datasets'
  #   end
  # end

  # Load this record from +hsh+: every key/value pair is assigned with
  # <tt>self[key] = value</tt>, then the record is saved with +save!+
  # (so a failed save raises).  Prints the incoming hash as JSON to
  # stdout.  Returns +self+.
  def undump(hsh)
    puts "undumping from #{hsh.to_json}"
    hsh.each { |k, v| self[k] = v }
    self.save!
    self
  end
end
|
|
37
|
+
|
|
38
|
+
class Pool < ActiveRecord::Base
  # Load this Pool from +hsh+.
  #
  # The entries under the string keys "datasets", "fields",
  # "contributors" and "pool_notes" are taken out of the hash; each
  # element is turned into a saved record of the matching class via
  # that class's own +undump+, and the resulting list is assigned to
  # <tt>self[<key>]</tt>.  Whatever remains is handed to
  # ActiveRecord::Base#undump.  Returns +self+.
  def undump(hsh)
    # Work on a copy: the nested keys are deleted below and the caller's
    # hash must not lose them.  (Bare +super+ forwards this copy.)
    hsh = hsh.dup
    { :datasets     => Dataset,     :fields     => Field,
      :contributors => Contributor, :pool_notes => PoolNote }.each do |field, klass|
      vals = hsh.delete(field.to_s) || []
      puts "Undumping #{vals} info #{field}"
      self[field.to_s] = vals.map { |val| f = klass.new().undump(val); f.save!; f }
    end
    super
    self
  end
end
|
|
50
|
+
|
|
51
|
+
class Dataset < ActiveRecord::Base
  # Load this Dataset from +hsh+.
  #
  # The entries under the string keys "datasets", "fields",
  # "contributors" and "dataset_notes" are taken out of the hash; each
  # element is turned into a saved record of the matching class via
  # that class's own +undump+, and the resulting list is assigned to
  # <tt>self[<key>]</tt>.  Whatever remains is handed to
  # ActiveRecord::Base#undump.  Prints the loaded record as YAML and
  # returns +self+.
  def undump(hsh)
    # Work on a copy: the nested keys are deleted below and the caller's
    # hash must not lose them.  (Bare +super+ forwards this copy.)
    hsh = hsh.dup
    { :datasets     => Dataset,     :fields        => Field,
      :contributors => Contributor, :dataset_notes => DatasetNote }.each do |field, klass|
      vals = hsh.delete(field.to_s) || []
      puts "Undumping #{vals} info #{field}"
      self[field.to_s] = vals.map { |val| f = klass.new().undump(val); f.save!; f }
    end
    super
    puts "Got Dataset #{self.to_yaml}"
    self
  end
end
|
|
64
|
+
|
|
65
|
+
require 'ostruct' # OpenStruct is the superclass below

class IMW < OpenStruct

  #
  # Takes an Infochimps Stupid Schema stream and
  # constructs the corresponding objects.
  #
  # NOTE: not implemented yet -- the method body is empty and it
  # currently returns nil whatever +schema+ is.  The rules below
  # describe the intended behaviour.
  #
  # Here are the rules:
  # * the schema has the structure
  #     # this has to be first.
  #     - infochimps_schema:
  #         schema_version: 0.2         # in case stuff changes
  #     # then any number of imw objects:
  #     - pool: (...)
  #         fields: [era, innings_pitched, ...]
  #     - dataset: (...)
  #         fields:
  #           - name:    Earned Run Average
  #             handle:  era
  #             concept: baseball-era
  #             units:   earned_runs / (9*innings_pitched)
  #     - contributor: (...)
  #     - field: (...)
  #
  # * Objects are referred to by __handle__, *NOT* __id__.  If an ID is
  #   included, and an object exists with a non-matching ID or handle,
  #   an error will be raised.
  #
  # * We want to make the schema files maintainable by hand, which means that
  #   the loader tries to be smart about inline-defined objects.  That is, you
  #   can either refer to (via handle) a field defined elsewhere, or you can
  #   define the field in whole, and trust that the Right Thing will
  #   happen. This presents the problem of collisions, though.  If a bulk object
  #   update arrives, we need to know whom to believe -- bulk loader or
  #   database.  In the absence of versioning: we look up the object by its
  #   handle.  If there's an existing object, any new information (fields with
  #   values in new that are blank in old) is added to it.  If the object is
  #   defined at the top level, it wins; if the object is defined as a sub field
  #   it loses.
  #
  # * Every interesting object (Pool, Dataset, Contributor, Field) has a desc:
  #   attribute (for Pool and Dataset it's virtual but never mind) to describe
  #   __itself__.  Additionally, every interesting relationship has its own desc: field.
  #

  def self.undump(schema)

    # compact then merge -- kill off blank
  end
end
|