imw 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. data/README.rdoc +194 -31
  2. data/VERSION +1 -1
  3. data/bin/imw +5 -0
  4. data/lib/imw/boot.rb +0 -15
  5. data/lib/imw/dataset/paths.rb +38 -0
  6. data/lib/imw/dataset/task.rb +21 -18
  7. data/lib/imw/dataset/workflow.rb +126 -65
  8. data/lib/imw/dataset.rb +56 -82
  9. data/lib/imw/files/basicfile.rb +3 -3
  10. data/lib/imw/files/compressed_files_and_archives.rb +23 -37
  11. data/lib/imw/files/csv.rb +2 -1
  12. data/lib/imw/files/directory.rb +62 -0
  13. data/lib/imw/files/excel.rb +84 -0
  14. data/lib/imw/files/sgml.rb +4 -23
  15. data/lib/imw/files.rb +62 -47
  16. data/lib/imw/packagers/archiver.rb +19 -1
  17. data/lib/imw/packagers/s3_mover.rb +8 -0
  18. data/lib/imw/parsers/html_parser/matchers.rb +251 -268
  19. data/lib/imw/parsers/html_parser.rb +181 -176
  20. data/lib/imw/parsers.rb +1 -1
  21. data/lib/imw/repository.rb +35 -0
  22. data/lib/imw/runner.rb +114 -0
  23. data/lib/imw/utils/extensions/core.rb +0 -16
  24. data/lib/imw/utils/paths.rb +0 -28
  25. data/lib/imw.rb +21 -32
  26. metadata +11 -19
  27. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
  28. data/lib/imw/dataset/datamapper.rb +0 -66
  29. data/lib/imw/dataset/loaddump.rb +0 -50
  30. data/lib/imw/dataset/old/file_collection.rb +0 -88
  31. data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
  32. data/lib/imw/dataset/scaffold.rb +0 -132
  33. data/lib/imw/dataset/scraped_uri.rb +0 -305
  34. data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
  35. data/lib/imw/dataset/scrub/scrub.rb +0 -147
  36. data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
  37. data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
  38. data/lib/imw/dataset/scrub/slug.rb +0 -101
  39. data/lib/imw/dataset/stats/counter.rb +0 -23
  40. data/lib/imw/dataset/stats.rb +0 -73
@@ -1,101 +0,0 @@
1
- require 'rubygems'
2
- require 'addressable/uri'
3
- require 'uuidtools'
4
- require 'scrub'
5
- require 'scrub_simple_url'
6
-
7
- module IMW
8
-
9
- #
10
- #
11
- # +handle+ -- reasonable effort at a uniq-ish, but human-comprehensible string
12
- # Handle should only contain the characters A-Za-z0-9_-./
13
- #
14
- #
15
- class Slug
16
- # A humane representation of the handle ('that-one-time-at_foo')
17
- attr_reader :handle
18
- # The purportedly unique string ('')
19
- attr_accessor :uniqish
20
-
21
- def initialize handle
22
- self.handle = handle
23
- self.uniqish = handle
24
- end
25
-
26
- #
27
- # Unless overridden, use the uniqish to
28
- # make a name-based UUID within the infochimps.org
29
- # namespace
30
- #
31
- def uuid
32
- UUID.sha1_create(UUID_URL_NAMESPACE, full_handle)
33
- end
34
-
35
- # Handle with only \w characters -- safe for everything there be
36
- def url_sane
37
- return '' if !handle
38
- handle.gsub(/[^\w\/\:]+/, '-').gsub(/_/, '__').gsub(%r{[/:]+}, '_')
39
- end
40
-
41
- def handle= t
42
- @handle = self.class.sanitize_handle(t)
43
- end
44
-
45
- # Strip all but handle-safe characters
46
- def self.sanitize_handle t, turd='-'
47
- t = t.gsub(%r{[^\w\-\./]+}, turd)
48
- end
49
- end
50
-
51
- #
52
- # Uses a URL (that's locator, not URI) as a
53
- # presumed-uniq identifier.
54
- #
55
- # +uniqish+ returns the full normalized URL
56
- #
57
- # +handle+ is formed from the dot-reversed host, the scheme (if not http) and a
58
- # sanitized version of the path. (The query string, fragment, etc are stripped
59
- # from the handle)
60
- #
61
- #
62
- class URLSlug < Slug
63
- attr_accessor :url
64
- def initialize url_str
65
- self.url = Addressable::URI.heuristic_parse(url_str).normalize
66
- raise "Bad URL #{url}" unless url.host
67
- self.uniqish = url.to_s
68
- self.handle = munge_url
69
- end
70
-
71
- def uuid
72
- UUID.sha1_create(UUID_URL_NAMESPACE, full_handle)
73
- end
74
- end
75
- end
76
-
77
-
78
-
79
- module Sluggable
80
- protected
81
- def create_slug
82
- "Slugging #{self.attributes}"
83
- if (self.class.slug_on == :url) || (self.name.blank?)
84
- slug = IMW::URLSlug.new(self.url)
85
- self.name = slug.handle
86
- else
87
- slug = IMW::Slug.new(self.name)
88
- end
89
- self.handle ||= slug.handle
90
- end
91
- public
92
-
93
- def self.included base
94
- base.before :save, :create_slug
95
- base.class_eval do
96
- def self.slug_on s=nil
97
- @slug_on ||= s
98
- end
99
- end
100
- end
101
- end
@@ -1,23 +0,0 @@
1
- module IMW
2
- class RecordCounter < Hash
3
- def record val
4
- self[val] ||= 0
5
- self[val] += 1
6
- end
7
-
8
- def if_seen val, &block
9
- if self[val]
10
- yield
11
- end
12
- record val
13
- end
14
-
15
- def unless_seen val, &block
16
- unless self[val]
17
- yield
18
- end
19
- record val
20
- end
21
-
22
- end
23
- end
@@ -1,73 +0,0 @@
1
- #
2
- # h2. lib/imw/dataset/stats.rb -- statistics for datasets
3
- #
4
- # == About
5
- #
6
- # Implements methods to calculate very basic statistical properties of
7
- # a dataset.
8
- #
9
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
- # Copyright:: Copyright (c) 2008 infochimps.org
11
- # License:: GPL 3.0
12
- # Website:: http://infinitemonkeywrench.org/
13
- #
14
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
-
16
-
17
- module IMW
18
- class Dataset
19
- #
20
- # simple histogram
21
- #
22
- # Runs down one column/attribute of a dataset
23
- # returning counts for that column
24
- #
25
- def hist slicer
26
- counts = { }
27
- els = slice(slicer)
28
- els.each do |el|
29
- counts[el] ||= 0
30
- counts[el] += 1
31
- end
32
- self.class.new(counts.map{ |el,ct| [ct,el] })
33
- end
34
-
35
- def slice slicer
36
- case
37
- when slicer.respond_to?(:call) then self.map{ |row| slicer.call(row) }
38
- else
39
- self.map{ |row| row[slicer] }
40
- end
41
- end
42
-
43
- #
44
- # Report
45
- #
46
- def report slicer, opts={}
47
- opts.reverse_merge! :n_top => 20, :hist_args => [], :fmt => "%7d\t%s", :do_hist => true
48
- counts = hist(slicer)
49
- report_hist data, counts, slicer, opts if opts[:do_hist]
50
- report_sizes data, counts, slicer, opts
51
- end
52
-
53
- def report_sizes data, counts, slicer, opts={}
54
- fmt = opts[:fmt]
55
- puts fmt % [counts.length, "unique elements"]
56
- puts fmt % [data.length, "total elements"]
57
- puts fmt % [counts.find_all(&:nil?).length, "nil elements"]
58
- uniqvals = counts.map{|ct,el| el}.reject(&:nil?)
59
- puts " min:\t#{uniqvals.min}"
60
- puts " max:\t#{uniqvals.max}"
61
- end
62
-
63
- # Most popular
64
- def report_hist data, counts, slicer, opts={}
65
- top = counts.sort_by{|ct,el| ct}[-opts[:n_top]..-1]
66
- puts "Top #{opts[:n_top]} elements for slice through #{slicer}:"
67
- puts " -freq-\t-element-"
68
- puts top.map{ |ct,el| opts[:fmt] % [ct,el] }
69
- puts "-------\t-------"
70
- end
71
-
72
- end
73
- end