imw 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +194 -31
- data/VERSION +1 -1
- data/bin/imw +5 -0
- data/lib/imw/boot.rb +0 -15
- data/lib/imw/dataset/paths.rb +38 -0
- data/lib/imw/dataset/task.rb +21 -18
- data/lib/imw/dataset/workflow.rb +126 -65
- data/lib/imw/dataset.rb +56 -82
- data/lib/imw/files/basicfile.rb +3 -3
- data/lib/imw/files/compressed_files_and_archives.rb +23 -37
- data/lib/imw/files/csv.rb +2 -1
- data/lib/imw/files/directory.rb +62 -0
- data/lib/imw/files/excel.rb +84 -0
- data/lib/imw/files/sgml.rb +4 -23
- data/lib/imw/files.rb +62 -47
- data/lib/imw/packagers/archiver.rb +19 -1
- data/lib/imw/packagers/s3_mover.rb +8 -0
- data/lib/imw/parsers/html_parser/matchers.rb +251 -268
- data/lib/imw/parsers/html_parser.rb +181 -176
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +35 -0
- data/lib/imw/runner.rb +114 -0
- data/lib/imw/utils/extensions/core.rb +0 -16
- data/lib/imw/utils/paths.rb +0 -28
- data/lib/imw.rb +21 -32
- metadata +11 -19
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
- data/lib/imw/dataset/datamapper.rb +0 -66
- data/lib/imw/dataset/loaddump.rb +0 -50
- data/lib/imw/dataset/old/file_collection.rb +0 -88
- data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
- data/lib/imw/dataset/scaffold.rb +0 -132
- data/lib/imw/dataset/scraped_uri.rb +0 -305
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
- data/lib/imw/dataset/scrub/scrub.rb +0 -147
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
- data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
- data/lib/imw/dataset/scrub/slug.rb +0 -101
- data/lib/imw/dataset/stats/counter.rb +0 -23
- data/lib/imw/dataset/stats.rb +0 -73
@@ -1,101 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'addressable/uri'
|
3
|
-
require 'uuidtools'
|
4
|
-
require 'scrub'
|
5
|
-
require 'scrub_simple_url'
|
6
|
-
|
7
|
-
module IMW
|
8
|
-
|
9
|
-
#
|
10
|
-
#
|
11
|
-
# +handle+ -- reasonable effort at a uniq-ish, but human-comprehensible string
|
12
|
-
# Handle should only contain the characters A-Za-z0-9_-./
|
13
|
-
#
|
14
|
-
#
|
15
|
-
# Pairs a human-readable +handle+ with the raw, purportedly unique string
# (+uniqish+) it was derived from.
class Slug
  # A humane representation of the handle ('that-one-time-at_foo').
  # Always stored in sanitized form; see Slug.sanitize_handle.
  attr_reader :handle
  # The purportedly unique string; #initialize sets it to the raw,
  # unsanitized value it was given.
  attr_accessor :uniqish

  # Builds a slug from +handle+: the sanitized text becomes #handle and the
  # untouched original becomes #uniqish.
  def initialize(handle)
    self.handle  = handle
    self.uniqish = handle
  end

  #
  # Unless overridden, use the uniqish to make a name-based (SHA-1) UUID
  # within UUID_URL_NAMESPACE.
  #
  # NOTE(review): UUID and UUID_URL_NAMESPACE are not defined in this file;
  # presumably they come from the uuidtools gem required at the top -- confirm.
  #
  def uuid
    # The previous code passed +full_handle+, a name defined nowhere in this
    # class, so every call raised NameError.
    UUID.sha1_create(UUID_URL_NAMESPACE, uniqish)
  end

  # Flattened form of the handle: every run of characters other than word
  # characters, '/' and ':' becomes '-', each '_' is doubled, and every run
  # of '/' or ':' becomes a single '_'.
  #
  # Returns '' when there is no handle.
  def url_sane
    return '' unless handle

    handle.gsub(/[^\w\/\:]+/, '-').gsub(/_/, '__').gsub(%r{[/:]+}, '_')
  end

  # Assigns the handle, sanitizing it first.
  def handle=(t)
    @handle = self.class.sanitize_handle(t)
  end

  # Strip all but handle-safe characters (word characters, '-', '.' and '/').
  # Each run of any other characters is replaced by a single +turd+.
  #
  # Returns nil when +t+ is nil, so that a slug can exist without a handle
  # (the case #url_sane already guards against) instead of raising.
  def self.sanitize_handle(t, turd = '-')
    return nil if t.nil?

    t.gsub(%r{[^\w\-\./]+}, turd)
  end
end
|
50
|
-
|
51
|
-
#
|
52
|
-
# Uses a URL (that's locator, not URI) as a
|
53
|
-
# presumed-uniq identifier.
|
54
|
-
#
|
55
|
-
# +uniqish+ returns the full normalized URL
|
56
|
-
#
|
57
|
-
# +handle+ is formed from the dot-reversed host, the scheme (if not http) and a
|
58
|
-
# sanitized version of the path. (The query string, fragment, etc are stripped
|
59
|
-
# from the handle)
|
60
|
-
#
|
61
|
-
#
|
62
|
-
# Slug whose identity comes from a URL: +uniqish+ is the full normalized URL
# and +handle+ is whatever +munge_url+ derives from it.
class URLSlug < Slug
  # The parsed, normalized URL (the result of Addressable::URI#normalize).
  attr_accessor :url

  # Parses and normalizes +url_str+, then derives uniqish and handle from it.
  #
  # Raises RuntimeError ("Bad URL ...") when the input cannot be parsed into
  # a URL with a host.
  def initialize(url_str)
    parsed = Addressable::URI.heuristic_parse(url_str)
    # heuristic_parse hands back nil for nil input; fail with the same
    # "Bad URL" error as a host-less URL rather than NoMethodError on nil.
    raise "Bad URL #{url_str.inspect}" unless parsed

    self.url = parsed.normalize
    raise "Bad URL #{url}" unless url.host

    self.uniqish = url.to_s
    # NOTE(review): munge_url is not defined in this file; presumably it is
    # supplied by one of the scrub libraries required at the top -- confirm.
    self.handle = munge_url
  end

  # Name-based (SHA-1) UUID of the normalized URL within UUID_URL_NAMESPACE.
  def uuid
    # Previously passed +full_handle+, which is defined nowhere in view and
    # raised NameError; +uniqish+ holds the normalized URL string.
    UUID.sha1_create(UUID_URL_NAMESPACE, uniqish)
  end
end
|
75
|
-
end
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
# Mixin that fills in a record's +name+ and +handle+ from a slug before the
# record is saved.
#
# The including class must provide a class-level +before+ hook registrar and
# instances that respond to +name+, +name=+, +url+, +handle+ and +handle=+.
# NOTE(review): +before+ looks like a DataMapper-style hook API and +blank?+
# comes from outside the standard library -- confirm against the host class.
module Sluggable
  # Registers #create_slug as a before-save hook on +base+ and gives +base+
  # a +slug_on+ class-level setting.
  def self.included(base)
    base.before :save, :create_slug
    base.class_eval do
      # Reads the slug source, or sets it on the first call with a non-nil
      # argument. Once set, later arguments are ignored.
      def self.slug_on(s = nil)
        @slug_on ||= s
      end
    end
  end

  protected

  # Builds the slug for this record.
  #
  # When the class is configured with <tt>slug_on :url</tt>, or the record
  # has a blank name, the slug comes from the URL and its handle also becomes
  # the record's name. Otherwise the slug comes from the name. An existing
  # handle is never overwritten.
  def create_slug
    # A leading "Slugging #{attributes}" string used to be built here and
    # immediately discarded; it has been removed as dead code.
    if (self.class.slug_on == :url) || name.blank?
      slug = IMW::URLSlug.new(url)
      self.name = slug.handle
    else
      slug = IMW::Slug.new(name)
    end
    self.handle ||= slug.handle
  end
end
|
@@ -1,23 +0,0 @@
|
|
1
|
-
module IMW
|
2
|
-
# Hash that tallies how many times each value has been recorded.
class RecordCounter < Hash
  # Bumps the tally for +val+, starting from zero when there is no truthy
  # entry yet. Returns the new tally.
  def record(val)
    self[val] = 0 unless self[val]
    self[val] = self[val] + 1
  end

  # Yields when +val+ already has a truthy entry, then records it either way.
  # Returns the new tally.
  def if_seen(val, &block)
    yield if self[val]
    record(val)
  end

  # Yields when +val+ has no truthy entry yet, then records it either way.
  # Returns the new tally.
  def unless_seen(val, &block)
    yield unless self[val]
    record(val)
  end
end
|
23
|
-
end
|
data/lib/imw/dataset/stats.rb
DELETED
@@ -1,73 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/dataset/stats.rb -- statistics for datasets
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# Implements methods to calculate very basic statistical properties of
|
7
|
-
# a dataset.
|
8
|
-
#
|
9
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
10
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
-
# License:: GPL 3.0
|
12
|
-
# Website:: http://infinitemonkeywrench.org/
|
13
|
-
#
|
14
|
-
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
15
|
-
|
16
|
-
|
17
|
-
module IMW
|
18
|
-
class Dataset
  # Defaults applied to the +opts+ of #report, #report_sizes and #report_hist.
  REPORT_DEFAULTS = {
    :n_top     => 20,
    :hist_args => [],
    :fmt       => "%7d\t%s",
    :do_hist   => true
  }.freeze

  #
  # simple histogram
  #
  # Runs down one column/attribute of a dataset (see #slice) and returns a
  # new instance of the same class built from [count, element] pairs, one
  # per distinct element, in order of first appearance.
  #
  def hist(slicer)
    counts = Hash.new(0)
    slice(slicer).each { |el| counts[el] += 1 }
    self.class.new(counts.map { |el, ct| [ct, el] })
  end

  # Extracts one value per row.
  #
  # When +slicer+ responds to +call+ it is invoked with each row; otherwise
  # it is used as an index/key into each row (<tt>row[slicer]</tt>).
  def slice(slicer)
    if slicer.respond_to?(:call)
      map { |row| slicer.call(row) }
    else
      map { |row| row[slicer] }
    end
  end

  #
  # Report
  #
  # Prints the most frequent elements (unless <tt>:do_hist</tt> is false)
  # followed by size statistics for the column selected by +slicer+.
  # Recognized +opts+: <tt>:n_top</tt>, <tt>:fmt</tt>, <tt>:do_hist</tt>;
  # missing keys come from REPORT_DEFAULTS. The caller's hash is left
  # untouched.
  #
  # NOTE(review): +data+ is not defined in the visible part of this class;
  # presumably it is an accessor declared where Dataset is first defined --
  # confirm.
  #
  def report(slicer, opts = {})
    opts   = REPORT_DEFAULTS.merge(opts)
    counts = hist(slicer)
    report_hist(data, counts, slicer, opts) if opts[:do_hist]
    report_sizes(data, counts, slicer, opts)
  end

  # Prints the number of unique, total and nil elements, then the smallest
  # and largest non-nil value.
  #
  # +counts+ is an enumerable of [count, element] pairs as built by #hist;
  # +data+ only needs to respond to +length+.
  def report_sizes(data, counts, slicer, opts = {})
    fmt = REPORT_DEFAULTS.merge(opts)[:fmt]
    # Sum the occurrences recorded for the nil element. Testing the
    # [count, element] pairs themselves for nil would always report zero.
    nil_total = counts.inject(0) { |sum, (ct, el)| el.nil? ? sum + ct : sum }
    uniqvals  = counts.map { |_ct, el| el }.compact

    puts fmt % [counts.length, "unique elements"]
    puts fmt % [data.length, "total elements"]
    puts fmt % [nil_total, "nil elements"]
    puts " min:\t#{uniqvals.min}"
    puts " max:\t#{uniqvals.max}"
  end

  # Most popular
  #
  # Prints up to <tt>opts[:n_top]</tt> of the most frequent elements, least
  # frequent first, one per line formatted with <tt>opts[:fmt]</tt>.
  def report_hist(data, counts, slicer, opts = {})
    opts = REPORT_DEFAULTS.merge(opts)
    # +last+ copes with fewer than n_top entries; a [-n..-1] slice returns
    # nil in that case.
    top = counts.sort_by { |ct, _el| ct }.last(opts[:n_top])

    puts "Top #{opts[:n_top]} elements for slice through #{slicer}:"
    puts " -freq-\t-element-"
    puts top.map { |ct, el| opts[:fmt] % [ct, el] }
    puts "-------\t-------"
  end
end
|
73
|
-
end
|