imw 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40)
  1. data/README.rdoc +194 -31
  2. data/VERSION +1 -1
  3. data/bin/imw +5 -0
  4. data/lib/imw/boot.rb +0 -15
  5. data/lib/imw/dataset/paths.rb +38 -0
  6. data/lib/imw/dataset/task.rb +21 -18
  7. data/lib/imw/dataset/workflow.rb +126 -65
  8. data/lib/imw/dataset.rb +56 -82
  9. data/lib/imw/files/basicfile.rb +3 -3
  10. data/lib/imw/files/compressed_files_and_archives.rb +23 -37
  11. data/lib/imw/files/csv.rb +2 -1
  12. data/lib/imw/files/directory.rb +62 -0
  13. data/lib/imw/files/excel.rb +84 -0
  14. data/lib/imw/files/sgml.rb +4 -23
  15. data/lib/imw/files.rb +62 -47
  16. data/lib/imw/packagers/archiver.rb +19 -1
  17. data/lib/imw/packagers/s3_mover.rb +8 -0
  18. data/lib/imw/parsers/html_parser/matchers.rb +251 -268
  19. data/lib/imw/parsers/html_parser.rb +181 -176
  20. data/lib/imw/parsers.rb +1 -1
  21. data/lib/imw/repository.rb +35 -0
  22. data/lib/imw/runner.rb +114 -0
  23. data/lib/imw/utils/extensions/core.rb +0 -16
  24. data/lib/imw/utils/paths.rb +0 -28
  25. data/lib/imw.rb +21 -32
  26. metadata +11 -19
  27. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
  28. data/lib/imw/dataset/datamapper.rb +0 -66
  29. data/lib/imw/dataset/loaddump.rb +0 -50
  30. data/lib/imw/dataset/old/file_collection.rb +0 -88
  31. data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
  32. data/lib/imw/dataset/scaffold.rb +0 -132
  33. data/lib/imw/dataset/scraped_uri.rb +0 -305
  34. data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
  35. data/lib/imw/dataset/scrub/scrub.rb +0 -147
  36. data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
  37. data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
  38. data/lib/imw/dataset/scrub/slug.rb +0 -101
  39. data/lib/imw/dataset/stats/counter.rb +0 -23
  40. data/lib/imw/dataset/stats.rb +0 -73
@@ -1,101 +0,0 @@
1
- require 'rubygems'
2
- require 'addressable/uri'
3
- require 'uuidtools'
4
- require 'scrub'
5
- require 'scrub_simple_url'
6
-
7
- module IMW
8
-
9
- #
10
- #
11
- # +handle+ -- reasonable effort at a uniq-ish, but human-comprehensible string
12
- # Handle should only contain the characters A-Za-z0-9_-./
13
- #
14
- #
15
- class Slug
16
- # A humane representation of the handle ('that-one-time-at_foo')
17
- attr_reader :handle
18
- # The purportedly unique string ('')
19
- attr_accessor :uniqish
20
-
21
- def initialize handle
22
- self.handle = handle
23
- self.uniqish = handle
24
- end
25
-
26
- #
27
- # Unless overridden, use the uniqish to
28
- # make a name-based UUID within the infochimps.org
29
- # namespace
30
- #
31
- def uuid
32
- UUID.sha1_create(UUID_URL_NAMESPACE, full_handle)
33
- end
34
-
35
- # Handle with only \w characters -- safe for everything there be
36
- def url_sane
37
- return '' if !handle
38
- handle.gsub(/[^\w\/\:]+/, '-').gsub(/_/, '__').gsub(%r{[/:]+}, '_')
39
- end
40
-
41
- def handle= t
42
- @handle = self.class.sanitize_handle(t)
43
- end
44
-
45
- # Strip all but handle-safe characters
46
- def self.sanitize_handle t, turd='-'
47
- t = t.gsub(%r{[^\w\-\./]+}, turd)
48
- end
49
- end
50
-
51
- #
52
- # Uses a URL (that's locator, not URI) as a
53
- # presumed-uniq identifier.
54
- #
55
- # +uniqish+ returns the full normalized URL
56
- #
57
- # +handle+ is formed from the dot-reversed host, the scheme (if not http) and a
58
- # sanitized version of the path. (The query string, fragment, etc are stripped
59
- # from the handle)
60
- #
61
- #
62
- class URLSlug < Slug
63
- attr_accessor :url
64
- def initialize url_str
65
- self.url = Addressable::URI.heuristic_parse(url_str).normalize
66
- raise "Bad URL #{url}" unless url.host
67
- self.uniqish = url.to_s
68
- self.handle = munge_url
69
- end
70
-
71
- def uuid
72
- UUID.sha1_create(UUID_URL_NAMESPACE, full_handle)
73
- end
74
- end
75
- end
76
-
77
-
78
-
79
- module Sluggable
80
- protected
81
- def create_slug
82
- "Slugging #{self.attributes}"
83
- if (self.class.slug_on == :url) || (self.name.blank?)
84
- slug = IMW::URLSlug.new(self.url)
85
- self.name = slug.handle
86
- else
87
- slug = IMW::Slug.new(self.name)
88
- end
89
- self.handle ||= slug.handle
90
- end
91
- public
92
-
93
- def self.included base
94
- base.before :save, :create_slug
95
- base.class_eval do
96
- def self.slug_on s=nil
97
- @slug_on ||= s
98
- end
99
- end
100
- end
101
- end
@@ -1,23 +0,0 @@
1
- module IMW
2
- class RecordCounter < Hash
3
- def record val
4
- self[val] ||= 0
5
- self[val] += 1
6
- end
7
-
8
- def if_seen val, &block
9
- if self[val]
10
- yield
11
- end
12
- record val
13
- end
14
-
15
- def unless_seen val, &block
16
- unless self[val]
17
- yield
18
- end
19
- record val
20
- end
21
-
22
- end
23
- end
@@ -1,73 +0,0 @@
1
- #
2
- # h2. lib/imw/dataset/stats.rb -- statistics for datasets
3
- #
4
- # == About
5
- #
6
- # Implements methods to calculate very basic statistical properties of
7
- # a dataset.
8
- #
9
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
- # Copyright:: Copyright (c) 2008 infochimps.org
11
- # License:: GPL 3.0
12
- # Website:: http://infinitemonkeywrench.org/
13
- #
14
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
-
16
-
17
- module IMW
18
- class Dataset
19
- #
20
- # simple histogram
21
- #
22
- # Runs down one column/attribute of a dataset
23
- # returning counts for that column
24
- #
25
- def hist slicer
26
- counts = { }
27
- els = slice(slicer)
28
- els.each do |el|
29
- counts[el] ||= 0
30
- counts[el] += 1
31
- end
32
- self.class.new(counts.map{ |el,ct| [ct,el] })
33
- end
34
-
35
- def slice slicer
36
- case
37
- when slicer.respond_to?(:call) then self.map{ |row| slicer.call(row) }
38
- else
39
- self.map{ |row| row[slicer] }
40
- end
41
- end
42
-
43
- #
44
- # Report
45
- #
46
- def report slicer, opts={}
47
- opts.reverse_merge! :n_top => 20, :hist_args => [], :fmt => "%7d\t%s", :do_hist => true
48
- counts = hist(slicer)
49
- report_hist data, counts, slicer, opts if opts[:do_hist]
50
- report_sizes data, counts, slicer, opts
51
- end
52
-
53
- def report_sizes data, counts, slicer, opts={}
54
- fmt = opts[:fmt]
55
- puts fmt % [counts.length, "unique elements"]
56
- puts fmt % [data.length, "total elements"]
57
- puts fmt % [counts.find_all(&:nil?).length, "nil elements"]
58
- uniqvals = counts.map{|ct,el| el}.reject(&:nil?)
59
- puts " min:\t#{uniqvals.min}"
60
- puts " max:\t#{uniqvals.max}"
61
- end
62
-
63
- # Most popular
64
- def report_hist data, counts, slicer, opts={}
65
- top = counts.sort_by{|ct,el| ct}[-opts[:n_top]..-1]
66
- puts "Top #{opts[:n_top]} elements for slice through #{slicer}:"
67
- puts " -freq-\t-element-"
68
- puts top.map{ |ct,el| opts[:fmt] % [ct,el] }
69
- puts "-------\t-------"
70
- end
71
-
72
- end
73
- end