imw 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +194 -31
- data/VERSION +1 -1
- data/bin/imw +5 -0
- data/lib/imw/boot.rb +0 -15
- data/lib/imw/dataset/paths.rb +38 -0
- data/lib/imw/dataset/task.rb +21 -18
- data/lib/imw/dataset/workflow.rb +126 -65
- data/lib/imw/dataset.rb +56 -82
- data/lib/imw/files/basicfile.rb +3 -3
- data/lib/imw/files/compressed_files_and_archives.rb +23 -37
- data/lib/imw/files/csv.rb +2 -1
- data/lib/imw/files/directory.rb +62 -0
- data/lib/imw/files/excel.rb +84 -0
- data/lib/imw/files/sgml.rb +4 -23
- data/lib/imw/files.rb +62 -47
- data/lib/imw/packagers/archiver.rb +19 -1
- data/lib/imw/packagers/s3_mover.rb +8 -0
- data/lib/imw/parsers/html_parser/matchers.rb +251 -268
- data/lib/imw/parsers/html_parser.rb +181 -176
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +35 -0
- data/lib/imw/runner.rb +114 -0
- data/lib/imw/utils/extensions/core.rb +0 -16
- data/lib/imw/utils/paths.rb +0 -28
- data/lib/imw.rb +21 -32
- metadata +11 -19
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
- data/lib/imw/dataset/datamapper.rb +0 -66
- data/lib/imw/dataset/loaddump.rb +0 -50
- data/lib/imw/dataset/old/file_collection.rb +0 -88
- data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
- data/lib/imw/dataset/scaffold.rb +0 -132
- data/lib/imw/dataset/scraped_uri.rb +0 -305
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
- data/lib/imw/dataset/scrub/scrub.rb +0 -147
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
- data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
- data/lib/imw/dataset/scrub/slug.rb +0 -101
- data/lib/imw/dataset/stats/counter.rb +0 -23
- data/lib/imw/dataset/stats.rb +0 -73
@@ -1,101 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'addressable/uri'
|
3
|
-
require 'uuidtools'
|
4
|
-
require 'scrub'
|
5
|
-
require 'scrub_simple_url'
|
6
|
-
|
7
|
-
module IMW
|
8
|
-
|
9
|
-
#
|
10
|
-
#
|
11
|
-
# +handle+ -- reasonable effort at a uniq-ish, but human-comprehensible string
|
12
|
-
# Handle should only contain the characters A-Za-z0-9_-./
|
13
|
-
#
|
14
|
-
#
|
15
|
-
class Slug
  # A humane representation of the handle ('that-one-time-at_foo').
  # Always stored in sanitized form (see .sanitize_handle).
  attr_reader :handle
  # The purportedly unique string this slug was built from.
  attr_accessor :uniqish

  # Builds a slug from +handle+.  The sanitized form is stored as #handle,
  # the raw, unsanitized argument is stored as #uniqish.
  def initialize handle
    self.handle  = handle
    self.uniqish = handle
  end

  #
  # Name-based (SHA1) UUID for this slug.
  #
  # Uses +full_handle+ when the object provides one; this file defines no
  # such method, so otherwise it falls back to +uniqish+ -- the documented
  # source of the UUID -- instead of raising NoMethodError.
  #
  # UUID and UUID_URL_NAMESPACE are expected to come from the uuidtools
  # library required at the top of the file.
  #
  def uuid
    name = respond_to?(:full_handle, true) ? full_handle : uniqish
    UUID.sha1_create(UUID_URL_NAMESPACE, name)
  end

  # The handle squeezed down to \w characters and '-':
  # runs of other characters become '-', each '_' is doubled, and runs of
  # '/' or ':' become a single '_'.  Returns '' when there is no handle.
  def url_sane
    return '' unless handle
    handle.gsub(/[^\w\/\:]+/, '-').gsub(/_/, '__').gsub(%r{[/:]+}, '_')
  end

  # Assigns the handle, sanitizing it first.
  def handle= t
    @handle = self.class.sanitize_handle(t)
  end

  # Strip all but handle-safe characters (A-Za-z0-9_-./): every run of
  # unsafe characters in +t+ is replaced by +turd+.
  #
  # nil passes through as nil (url_sane already copes with a missing
  # handle); anything else is converted with to_s first.
  def self.sanitize_handle t, turd='-'
    return nil if t.nil?
    t.to_s.gsub(%r{[^\w\-\./]+}, turd)
  end
end
|
50
|
-
|
51
|
-
#
|
52
|
-
# Uses a URL (that's locator, not URI) as a
|
53
|
-
# presumed-uniq identifier.
|
54
|
-
#
|
55
|
-
# +uniqish+ returns the full normalized URL
|
56
|
-
#
|
57
|
-
# +handle+ is formed from the dot-reversed host, the scheme (if not http) and a
|
58
|
-
# sanitized version of the path. (The query string, fragment, etc are stripped
|
59
|
-
# from the handle)
|
60
|
-
#
|
61
|
-
#
|
62
|
-
class URLSlug < Slug
  # The parsed, normalized URL this slug was built from.
  attr_accessor :url

  # Parses and normalizes +url_str+, then derives +uniqish+ (the full
  # normalized URL as a string) and +handle+ from it.
  #
  # Raises RuntimeError ("Bad URL ...") when the string cannot be parsed
  # or the result has no host.  A nil parse result is now reported the same
  # way instead of blowing up with NoMethodError on +normalize+.
  #
  # #uuid is inherited from Slug; the override that used to live here was
  # identical to the parent's implementation.
  def initialize url_str
    parsed   = Addressable::URI.heuristic_parse(url_str)
    self.url = parsed && parsed.normalize
    raise "Bad URL #{url || url_str.inspect}" unless url && url.host
    self.uniqish = url.to_s
    # NOTE(review): munge_url is not defined in this file; presumably it is
    # supplied by one of the libraries required at the top -- confirm.
    self.handle  = munge_url
  end
end
|
75
|
-
end
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
#
# Mixin that fills in a record's +handle+ (and, when needed, its +name+)
# from a slug just before the record is saved.
#
# The including class must provide a +before+ class method (called as
# <tt>before :save, :create_slug</tt>) and +name+, +url+ and +handle+
# accessors; +name+ must answer +blank?+.
#
module Sluggable
  # Hook run on include: registers #create_slug as a before-save callback
  # and gives the including class a +slug_on+ class-level setting.
  def self.included base
    base.before :save, :create_slug
    base.class_eval do
      # Reads the slug source; the first non-nil argument ever passed is
      # remembered and later arguments are ignored.
      def self.slug_on s=nil
        @slug_on ||= s
      end
    end
  end

  protected

  # Builds the slug for this record.
  #
  # When the class slugs on :url, or the record has no name, the slug comes
  # from +url+ and its handle also becomes the record's name; otherwise the
  # slug comes from +name+.  An already-set +handle+ is never overwritten.
  def create_slug
    if (self.class.slug_on == :url) || self.name.blank?
      slug = IMW::URLSlug.new(self.url)
      self.name = slug.handle
    else
      slug = IMW::Slug.new(self.name)
    end
    self.handle ||= slug.handle
  end
end
|
@@ -1,23 +0,0 @@
|
|
1
|
-
module IMW
|
2
|
-
#
# A Hash that tallies how many times each value has been recorded.
# Keys are the recorded values, entries are their running counts.
#
class RecordCounter < Hash
  # Bumps the tally for +val+ (starting from zero on first sight) and
  # returns the new count.
  def record val
    self[val] ||= 0
    self[val] += 1
  end

  # Yields to the given block only when +val+ already has an entry, then
  # records +val+.  Returns the updated count.
  def if_seen val
    yield if self[val]
    record val
  end

  # Yields to the given block only when +val+ has no entry yet, then
  # records +val+.  Returns the updated count.
  def unless_seen val
    yield unless self[val]
    record val
  end
end
|
23
|
-
end
|
data/lib/imw/dataset/stats.rb
DELETED
@@ -1,73 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/dataset/stats.rb -- statistics for datasets
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# Implements methods to calculate very basic statistical properties of
|
7
|
-
# a dataset.
|
8
|
-
#
|
9
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
10
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
-
# License:: GPL 3.0
|
12
|
-
# Website:: http://infinitemonkeywrench.org/
|
13
|
-
#
|
14
|
-
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
15
|
-
|
16
|
-
|
17
|
-
module IMW
|
18
|
-
class Dataset
  # Options used by #report and its helpers when the caller leaves them out.
  #   :n_top     -- how many of the most frequent values to list
  #   :hist_args -- accepted for compatibility; not read by this code
  #   :fmt       -- format string applied to [count, label] pairs
  #   :do_hist   -- whether #report prints the top-values table
  REPORT_DEFAULTS = {
    :n_top     => 20,
    :hist_args => [],
    :fmt       => "%7d\t%s",
    :do_hist   => true
  }.freeze

  #
  # simple histogram
  #
  # Runs down one column/attribute of a dataset (see #slice) and returns a
  # new instance of the receiver's class built from [count, element] pairs.
  #
  def hist slicer
    counts = Hash.new(0)
    slice(slicer).each { |el| counts[el] += 1 }
    self.class.new(counts.map { |el, ct| [ct, el] })
  end

  # Extracts one value per row: the result of calling +slicer+ with the row
  # when it responds to +call+, otherwise <tt>row[slicer]</tt>.
  def slice slicer
    if slicer.respond_to?(:call)
      self.map { |row| slicer.call(row) }
    else
      self.map { |row| row[slicer] }
    end
  end

  #
  # Report
  #
  # Prints the most frequent values of a slice (unless :do_hist is false)
  # followed by size statistics.  +opts+ is merged over REPORT_DEFAULTS
  # into a fresh hash, so the caller's hash is left unmodified.
  #
  def report slicer, opts={}
    opts   = REPORT_DEFAULTS.merge(opts)
    counts = hist(slicer)
    # NOTE(review): +data+ is not defined in this file; presumably an
    # accessor declared elsewhere on Dataset -- confirm.
    report_hist data, counts, slicer, opts if opts[:do_hist]
    report_sizes data, counts, slicer, opts
  end

  # Prints the number of unique, total and nil elements, then the min and
  # max of the non-nil unique values.  +counts+ must enumerate
  # [count, element] pairs as produced by #hist.
  def report_sizes data, counts, slicer, opts={}
    fmt = REPORT_DEFAULTS.merge(opts)[:fmt]
    # Count nil *elements*: the pairs themselves are never nil, so testing
    # each pair with nil? always reported zero.
    nil_count = counts.inject(0) { |sum, (ct, el)| el.nil? ? sum + ct : sum }
    puts fmt % [counts.length, "unique elements"]
    puts fmt % [data.length, "total elements"]
    puts fmt % [nil_count, "nil elements"]
    uniqvals = counts.map { |ct, el| el }.compact
    puts " min:\t#{uniqvals.min}"
    puts " max:\t#{uniqvals.max}"
  end

  # Most popular: prints the :n_top most frequent elements in ascending
  # order of frequency.
  def report_hist data, counts, slicer, opts={}
    opts = REPORT_DEFAULTS.merge(opts)
    # last(n) copes with fewer than n distinct values; slicing with
    # [-n..-1] returned nil in that case and the map below blew up.
    top = counts.sort_by { |ct, el| ct }.last(opts[:n_top])
    puts "Top #{opts[:n_top]} elements for slice through #{slicer}:"
    puts " -freq-\t-element-"
    puts top.map { |ct, el| opts[:fmt] % [ct, el] }
    puts "-------\t-------"
  end
end
|
73
|
-
end
|