imw 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. data/README.rdoc +194 -31
  2. data/VERSION +1 -1
  3. data/bin/imw +5 -0
  4. data/lib/imw/boot.rb +0 -15
  5. data/lib/imw/dataset/paths.rb +38 -0
  6. data/lib/imw/dataset/task.rb +21 -18
  7. data/lib/imw/dataset/workflow.rb +126 -65
  8. data/lib/imw/dataset.rb +56 -82
  9. data/lib/imw/files/basicfile.rb +3 -3
  10. data/lib/imw/files/compressed_files_and_archives.rb +23 -37
  11. data/lib/imw/files/csv.rb +2 -1
  12. data/lib/imw/files/directory.rb +62 -0
  13. data/lib/imw/files/excel.rb +84 -0
  14. data/lib/imw/files/sgml.rb +4 -23
  15. data/lib/imw/files.rb +62 -47
  16. data/lib/imw/packagers/archiver.rb +19 -1
  17. data/lib/imw/packagers/s3_mover.rb +8 -0
  18. data/lib/imw/parsers/html_parser/matchers.rb +251 -268
  19. data/lib/imw/parsers/html_parser.rb +181 -176
  20. data/lib/imw/parsers.rb +1 -1
  21. data/lib/imw/repository.rb +35 -0
  22. data/lib/imw/runner.rb +114 -0
  23. data/lib/imw/utils/extensions/core.rb +0 -16
  24. data/lib/imw/utils/paths.rb +0 -28
  25. data/lib/imw.rb +21 -32
  26. metadata +11 -19
  27. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
  28. data/lib/imw/dataset/datamapper.rb +0 -66
  29. data/lib/imw/dataset/loaddump.rb +0 -50
  30. data/lib/imw/dataset/old/file_collection.rb +0 -88
  31. data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
  32. data/lib/imw/dataset/scaffold.rb +0 -132
  33. data/lib/imw/dataset/scraped_uri.rb +0 -305
  34. data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
  35. data/lib/imw/dataset/scrub/scrub.rb +0 -147
  36. data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
  37. data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
  38. data/lib/imw/dataset/scrub/slug.rb +0 -101
  39. data/lib/imw/dataset/stats/counter.rb +0 -23
  40. data/lib/imw/dataset/stats.rb +0 -73
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: imw
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dhruv Bansal
@@ -10,14 +10,14 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2009-12-20 00:00:00 -06:00
14
- default_executable:
13
+ date: 2010-02-02 00:00:00 -06:00
14
+ default_executable: imw
15
15
  dependencies: []
16
16
 
17
17
  description: The Infinite Monkeywrench (IMW) is a Ruby framework to simplify the tasks of acquiring, extracting, transforming, loading, and packaging data. It minimizes programmer time by encapsulating common data workflows and patterns and creating interfaces to many other useful Ruby libraries.
18
18
  email: coders@infochimps.org
19
- executables: []
20
-
19
+ executables:
20
+ - imw
21
21
  extensions: []
22
22
 
23
23
  extra_rdoc_files:
@@ -30,24 +30,12 @@ files:
30
30
  - README.rdoc
31
31
  - Rakefile
32
32
  - VERSION
33
+ - bin/imw
33
34
  - etc/imwrc.rb
34
35
  - lib/imw.rb
35
36
  - lib/imw/boot.rb
36
37
  - lib/imw/dataset.rb
37
- - lib/imw/dataset/datamapper.rb
38
- - lib/imw/dataset/datamapper/time_and_user_stamps.rb
39
- - lib/imw/dataset/loaddump.rb
40
- - lib/imw/dataset/old/file_collection.rb
41
- - lib/imw/dataset/old/file_collection_utils.rb
42
- - lib/imw/dataset/scaffold.rb
43
- - lib/imw/dataset/scraped_uri.rb
44
- - lib/imw/dataset/scrub/old_working_scrubber.rb
45
- - lib/imw/dataset/scrub/scrub.rb
46
- - lib/imw/dataset/scrub/scrub_simple_url.rb
47
- - lib/imw/dataset/scrub/scrub_test.rb
48
- - lib/imw/dataset/scrub/slug.rb
49
- - lib/imw/dataset/stats.rb
50
- - lib/imw/dataset/stats/counter.rb
38
+ - lib/imw/dataset/paths.rb
51
39
  - lib/imw/dataset/task.rb
52
40
  - lib/imw/dataset/workflow.rb
53
41
  - lib/imw/files.rb
@@ -58,6 +46,8 @@ files:
58
46
  - lib/imw/files/compressed_files_and_archives.rb
59
47
  - lib/imw/files/compressible.rb
60
48
  - lib/imw/files/csv.rb
49
+ - lib/imw/files/directory.rb
50
+ - lib/imw/files/excel.rb
61
51
  - lib/imw/files/json.rb
62
52
  - lib/imw/files/sgml.rb
63
53
  - lib/imw/files/text.rb
@@ -70,6 +60,8 @@ files:
70
60
  - lib/imw/parsers/html_parser/matchers.rb
71
61
  - lib/imw/parsers/line_parser.rb
72
62
  - lib/imw/parsers/regexp_parser.rb
63
+ - lib/imw/repository.rb
64
+ - lib/imw/runner.rb
73
65
  - lib/imw/utils.rb
74
66
  - lib/imw/utils/components.rb
75
67
  - lib/imw/utils/config.rb
@@ -1,37 +0,0 @@
1
- require 'rubygems'
2
-
3
- # gem 'dm-core', '=0.9.6'
4
- require 'dm-core'
5
-
6
- #
7
- # Stolen from http://github.com/sam/dm-more/tree/master/dm-timestamps/lib/dm-timestamps.rb
8
- #
9
-
10
- module DataMapper
11
- module Timestamp
12
- TIMESTAMP_PROPERTIES = {
13
- :updated_at => lambda { |r| r.updated_at = DateTime.now },
14
- :updated_on => lambda { |r| r.updated_on = Date.today },
15
- :updated_by => lambda { |r| r.updated_by = IMW::USER_INFO[:id] },
16
- :created_at => lambda { |r| r.created_at = DateTime.now if r.new_record? && r.created_at.nil? },
17
- :created_on => lambda { |r| r.created_on = Date.today if r.new_record? && r.created_on.nil?},
18
- :created_by => lambda { |r| r.created_by = IMW::USER_INFO[:id] if r.new_record? && r.created_by.blank?},
19
- }
20
-
21
- def self.included(model)
22
- model.before :save, :set_timestamp_properties
23
- end
24
-
25
- private
26
-
27
- def set_timestamp_properties
28
- if dirty?
29
- self.class.properties.slice(*TIMESTAMP_PROPERTIES.keys).compact.each do |property|
30
- TIMESTAMP_PROPERTIES[property.name][self] unless attribute_dirty?(property.name)
31
- end
32
- end
33
- end
34
- end # module Timestamp
35
-
36
- Resource::append_inclusions Timestamp
37
- end
@@ -1,66 +0,0 @@
1
- #
2
- # h2. lib/imw/dataset/datamapper.rb -- extensions to datamapper for datasets
3
- #
4
- # == About
5
- #
6
- # The DataMapper[http://datamapper.org/] library is an ORM for Ruby
7
- # which is lighter than ActiveRecord[http://ar.rubyonrails.com/] and
8
- # the like. It is the ORM that IMW is designed to work natively with.
9
- #
10
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
11
- # Copyright:: Copyright (c) 2008 infochimps.org
12
- # License:: GPL 3.0
13
- # Website:: http://infinitemonkeywrench.org/
14
- #
15
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
16
-
17
- require 'imw/utils'
18
- require 'dm-core'
19
- require 'dm-ar-finders'
20
- require 'dm-aggregates'
21
- require 'dm-serializer'
22
-
23
- module DataMapper
24
- # Connect to a remote database
25
- def self.setup_remote_connection options
26
- options = { :handle => :default }.merge options
27
- params = options.values_at(:protocol, :username, :password, :hostname, :dbname)
28
- DataMapper.setup(options[:handle], "%s://%s:%s@%s/%s" % params)
29
- end
30
- # Connect to a local database
31
- def self.setup_local_connection options
32
- options = { :handle => :default }.merge options
33
- params = options.values_at(:protocol, :dbpath, :dbname)
34
- DataMapper.setup(options[:handle], "%s://%s/%s" % params)
35
- end
36
-
37
- # KLUDGE
38
- def self.open_repositories repository_dbnames, params
39
- repository_dbnames.each do |handle, dbname|
40
- repo_params = params.merge({ :handle => handle, :dbname => dbname })
41
- DataMapper.setup_remote_connection repo_params
42
- end
43
- end
44
-
45
-
46
- module Model
47
-
48
- # Find or create the resource matching search attributes and in
49
- # either case set the update-able attributes.
50
- def update_or_create(search_attributes, updateable_attributes = {})
51
- if (resource = first(search_attributes))
52
- resource.update_attributes updateable_attributes
53
- else
54
- resource = create(search_attributes.merge(updateable_attributes))
55
- end
56
- resource
57
- end
58
-
59
- end
60
-
61
- # watch SQL log -- must be BEFORE call to db setup
62
- def self.logging=(verbosity)
63
- verbosity = :debug if (verbosity == true)
64
- DataMapper::Logger.new(STDERR, verbosity) if verbosity
65
- end
66
- end
@@ -1,50 +0,0 @@
1
- #
2
- # h2. lib/imw/dataset/loaddump.rb -- read and write datasets to resources
3
- #
4
- # == About
5
- #
6
- # Implements methods to load a dataset from a resource and to write a
7
- # dataset back to a resource.
8
- #
9
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
- # Copyright:: Copyright (c) 2008 infochimps.org
11
- # License:: GPL 3.0
12
- # Website:: http://infinitemonkeywrench.org/
13
- #
14
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
-
16
- require 'imw/utils'
17
-
18
- module IMW
19
- class Dataset
20
-
21
- # Return the data in +filename+ in an appropriate form.
22
- #
23
- # FIXME How do I pass a block from one method to another?
24
- def self.load filename, &block
25
- filename = path_to(filename)
26
- announce "Loading #{filename}"
27
- file = IMW.open(filename)
28
- data = file.load(filename)
29
- if block
30
- data.each{|record| yield record}
31
- file
32
- else
33
- data
34
- end
35
- end
36
-
37
- # Dump +data+ to +filename+.
38
- def self.dump data, filename
39
- filename = path_to(filename)
40
- announce "Dumping to #{filename}"
41
- IMW.open(filename,'w').dump(data)
42
- end
43
-
44
- # Dispatch to <tt>Dataset.dump</tt>.
45
- def dump filename
46
- self.class.dump self.data, *args
47
- end
48
-
49
- end
50
- end
@@ -1,88 +0,0 @@
1
- require 'imw/dataset'
2
- require 'imw/dataset/uri'
3
-
4
- #
5
- # All the files associated with a given URL
6
- #
7
- class DatasetFileCollection
8
- include DataMapper::Resource
9
- property :id, Integer, :serial => true
10
- property :category, String, :nullable => false, :unique_index => :category
11
- has n, :ripped_file_collections
12
-
13
- end
14
-
15
- #
16
- # Collection of raw files retrieved from a spider based at a given URL
17
- #
18
- class RippedFileCollection
19
- include DataMapper::Resource
20
- property :id, Integer, :serial => true
21
- belongs_to :url, :class_name => DM_URI, :child_key => [:url_id]
22
- has n, :ripped_files
23
- belongs_to :dataset_file_collection
24
-
25
- def self.find_or_create_from_url url, dataset_file_collection
26
- url = DM_URI.find_or_create_from_url(url)
27
- ripdfiles = self.find_or_create(
28
- { :url_id => url.id },
29
- { :dataset_file_collection => dataset_file_collection})
30
- end
31
-
32
- def listing_filename()
33
- path_to(:rawd, "listing-#{url.as_flat_filename}.txt")
34
- end
35
-
36
- def make_listing_file
37
- return if File.exist?(listing_filename)
38
- FileUtils.cd path_to(:ripd_root) do
39
- `find #{url.as_path} > #{listing_filename}`
40
- end
41
- end
42
-
43
- # Mon Aug 11 08:59:00 -0500 2008 files: 0
44
- # Mon Aug 11 09:05:34 -0500 2008 files: 100000 => so, 1M files/hr. not good.
45
- def index_from_listing
46
- make_listing_file
47
- self.ripped_files
48
- FileUtils.cd path_to(:ripd_root) do
49
- File.foreach(listing_filename) do |full_path|
50
- track_count :files
51
- full_path.chomp!
52
- ripd_path = full_path[1+url.as_path.length..-1]
53
- next if ripd_path.blank?
54
- RippedFile.from_file(self, full_path, ripd_path)
55
- end
56
- end
57
- self.save
58
- end
59
- end
60
-
61
- #
62
- # Index the raw files retrieved from website
63
- #
64
- class RippedFile
65
- include DataMapper::Resource
66
- property :id, Integer, :serial => true
67
- property :ripped_file_collection_id, Integer, :unique_index => :ripd_path
68
- property :ripd_path, String, :length => 255, :nullable => false, :unique_index => :ripd_path
69
- property :retrieval_date, DateTime
70
- property :compressed_size, Integer
71
- belongs_to :ripped_file_collection
72
-
73
- def self.from_file clxn, full_path, ripd_path
74
- filedate = File.mtime(full_path)
75
- filesize = File.size( full_path)
76
- ripped_file = self.find_or_create({ :ripd_path => ripd_path }, {
77
- :ripped_file_collection => clxn,
78
- :retrieval_date => filedate,
79
- :compressed_size => filesize,
80
- })
81
- ripped_file
82
- end
83
-
84
- end
85
-
86
- # SELECT r.*, u.host, u.path FROM ripped_files r
87
- # LEFT JOIN ripped_file_collections rfs ON r.ripped_file_collection_id = rfs.id
88
- # LEFT JOIN dm_uris u ON rfs.url_id = u.id
@@ -1,71 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require 'imw/utils'; include IMW
3
- require 'imw/dataset/file_collection'
4
- require 'tempfile'
5
-
6
- def bulk_listing_filename() '/tmp/listing_foo.txt' end
7
- def table_name() 'ripped_files' end
8
-
9
- def run_mysql_cmd db_params, cmd
10
- username, password, hostname, dbname = db_params.values_at(:username, :password, :hostname, :dbname)
11
- query_file = Tempfile.new("qlstg")
12
- query_file.puts cmd
13
- query_file.close
14
- puts `time mysql -E -u#{username} -p#{password} -h#{hostname} #{dbname} < #{query_file.path}`
15
- end
16
-
17
- def bulk_load_mysql db_params, ripd_base
18
- announce "Calling mysql to bulk load #{ripd_base} (expect ~2s per 100k files)"
19
- run_mysql_cmd db_params, %Q{
20
- LOAD DATA LOCAL INFILE '#{bulk_listing_filename}'
21
- REPLACE INTO TABLE `#{table_name}`
22
- FIELDS TERMINATED BY ','
23
- LINES TERMINATED BY '\n'
24
- (`ripped_file_collection_id`, `ripd_path`, `retrieval_date`, `compressed_size`)
25
- ;
26
- }
27
- end
28
-
29
- def clear_table
30
- run_mysql_cmd "TRUNCATE #{table_name}"
31
- end
32
-
33
- class RippedFileCollection
34
- def bulk_load_listing db_params, extra_find_args=""
35
- announce "Indexing #{url.as_path} (expect ~10s per 100k files)"
36
- FileUtils.cd path_to(:ripd_root) do
37
- find_fmt = "#{self.id},%P,%TY-%Tm-%Td %TH:%TM:%TS,%s\n"
38
- find_cmd = "find #{url.as_path} #{extra_find_args} -printf '#{find_fmt}' > #{bulk_listing_filename}"
39
- puts `time #{find_cmd}`
40
- end unless File.exist?(bulk_listing_filename)
41
- bulk_load_mysql db_params, url.as_path
42
- end
43
- end
44
-
45
-
46
- # SELECT rf_yrs.*, dfc.*, url.scheme, url.host, url.path
47
- # FROM (
48
- # SELECT SUBSTR(ripd_path,1,4) AS yr, COUNT(*), r.*
49
- # FROM ripped_files r
50
- # GROUP BY ripped_file_collection_id, yr
51
- # ORDER BY ripped_file_collection_id, yr
52
- # ) rf_yrs
53
- # LEFT JOIN ripped_file_collections rfc ON rfc.id = rf_yrs.ripped_file_collection_id
54
- # LEFT JOIN dataset_file_collections dfc ON dfc.id = rfc.dataset_file_collection_id
55
- # LEFT JOIN dm_uris url ON url.id = rfc.url_id
56
-
57
- db_params = IMW::DEFAULT_DATABASE_CONNECTION_PARAMS.merge({ :dbname => 'imw_weather_ncdc' })
58
- IMW::Dataset.setup_remote_connection db_params
59
-
60
- # Daily
61
- daily_dset_clxn = DatasetFileCollection.find_or_create({ :category => 'weather/ncdc/daily' })
62
- rf_clxn = RippedFileCollection.find_or_create_from_url 'ftp://ftp.ncdc.noaa.gov/pub/data/gsod', daily_dset_clxn
63
- rf_clxn.bulk_load_listing db_params
64
- # Hourly
65
- hourly_dset_clxn = DatasetFileCollection.find_or_create({ :category => 'weather/ncdc/hourly' })
66
- rf_clxn = RippedFileCollection.find_or_create_from_url 'ftp://ftp.ncdc.noaa.gov/pub/data/noaa', hourly_dset_clxn
67
- rf_clxn.bulk_load_listing db_params, '\\! \\( -iname "isd-lite" -prune \\) '
68
- # Hourly-lite
69
- hlite_dset_clxn = DatasetFileCollection.find_or_create({ :category => 'weather/ncdc/hourly_lite' })
70
- rf_clxn = RippedFileCollection.find_or_create_from_url 'ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-lite', hlite_dset_clxn
71
- rf_clxn.bulk_load_listing db_params
@@ -1,132 +0,0 @@
1
- #
2
- # h2. lib/imw/workflow/scaffold.rb -- scaffold the directory structure for a dataset
3
- #
4
- # == About
5
- #
6
- # Defines workflow tasks for datasets to create directories and
7
- # symlinks to ease the processing of a dataset.
8
- #
9
- # Right now this file contains code written by Flip as well as code
10
- # written by Dhruv which accomplish basically the same task. Dhruv's
11
- # code integrates with <tt>IMW::Dataset</tt> and Rake and should be
12
- # used preferentially.
13
- #
14
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
15
- # Copyright:: Copyright (c) 2008 infochimps.org
16
- # License:: GPL 3.0
17
- # Website:: http://infinitemonkeywrench.org/
18
- #
19
- # puts "#{File.basename(__FILE__)}: POST NO BILLS. Is that funny to anyone but me? No?" # at bottom
20
-
21
- require 'rake'
22
- require 'fileutils'
23
-
24
- require 'imw/utils'
25
- require 'imw/dataset/task'
26
-
27
- include FileUtils
28
-
29
- module IMW
30
- include FileUtils
31
-
32
- ################################################################
33
- ## FLIP'S CODE
34
- ################################################################
35
-
36
- def scaffold_script_dirs
37
- mkdir_p path_to(:me)
38
- end
39
-
40
- #
41
- # * creates a directory for the dataset in each of the top-level hierarchies
42
- # (as given in ~/.imwrc)
43
- # * links to that directory within the working directory
44
- # in directory pool/foo/bar/baz we'd find
45
- # rawd => /data/rawd/foo/bar/baz
46
- #
47
- def scaffold_dset_dirs
48
- [:rawd, :temp, :fixd, :log].each do |seg|
49
- unless File.exist?(path_to(seg))
50
- seg_dir = path_to(pathseg_root(seg), :dset)
51
- mkdir_p seg_dir
52
- ln_s seg_dir, path_to(seg)
53
- end
54
- end
55
- end
56
-
57
-
58
- #
59
- # * creates a symlink within the working directory to the
60
- # ripped directory, named after its url
61
- #
62
- def scaffold_rip_dir url
63
- unless File.exist?(path_to(seg))
64
- ripd_dir = path_to(:ripd_root, url)
65
- mkdir_p ripd_dir
66
- ln_s ripd_dir, path_to(:ripd)
67
- end
68
- end
69
-
70
- def scaffold_dset
71
- scaffold_script_dirs
72
- scaffold_dset_dirs
73
- end
74
-
75
-
76
- ################################################################
77
- ## DHRUV's CODE -- uses IMW::Dataset and Rake
78
- ################################################################
79
- module Workflow
80
-
81
- # Creates a workflow task <tt>:create_directories</tt> to create
82
- # the directory structure for this dataset.
83
- def create_directories_task
84
- @last_description = "Creates directories for this dataset in the peel through package steps."
85
- define_task(IMW::Task, {:create_directories => []}) do
86
- [:peel, :munge, :fix, :package].each do |step|
87
- FileUtils.mkdir_p(path_to(step)) unless File.exist?(path_to(step))
88
- end
89
- end
90
- end
91
-
92
- # Creates a workflow task <tt>:create_symlinks</tt> to create
93
- # symlinks from the script directory to this dataset's step directories.
94
- def create_symlinks_task
95
- @last_description = "Creates symlinks pointing from the directory containing scripts for this dataset to the directories for the peel through package steps."
96
- define_task(IMW::Task, {:create_symlinks => [:create_directories]}) do
97
- [:peel, :munge, :fix, :package].each do |step|
98
- symlink = File.join(path_to(:script),IMW::Dataset::WORKFLOW_STEP_DIRS[step].to_s)
99
- FileUtils.ln_s(path_to(step), symlink) unless File.exist?(symlink)
100
- end
101
- symlink = File.join(path_to(:script), "ripd")
102
- FileUtils.ln_s(path_to(:ripd_root), symlink) unless File.exist?(symlink)
103
- end
104
- end
105
-
106
- # Creates a task <tt>:initialize</tt> which does nothing but
107
- # depends upon all the tasks required to initialize the dataset.
108
- def create_initialize_task
109
- @last_description = "Set everything up to begin processing the dataset."
110
- define_task(IMW::Task, :initialize => [:create_directories, :create_symlinks])
111
- end
112
-
113
- # Removes all data for this dataset from the data directories.
114
- def create_delete_data_task
115
- @last_description = "Deletes all data and directories for this dataset for the peel through package steps."
116
- define_task(IMW::Task, {:delete_data => []}) do
117
- [:peel, :munge, :fix, :package].each do |step|
118
- FileUtils.remove_dir(path_to(step)) if File.exist?(path_to(step))
119
- end
120
- end
121
- end
122
-
123
- # Creates a task <tt>:destroy</tt> which does nothing but depends
124
- # upon all the tasks required to delete the dataset's data and
125
- # remove its footprint from IMW.
126
- def create_destroy_task
127
- @last_description = "Get rid of all traces of this dataset."
128
- define_task(IMW::Task, :destroy => [:delete_data])
129
- end
130
-
131
- end
132
- end