hackboxen 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,71 @@
+ require 'rubygems'
+ require 'configliere'
+ require 'rake'
+
+ hb_lib_dir = File.join(File.dirname(__FILE__), '../../../')
+ machine_config = '/etc/hackbox/hackbox.yaml'
+ install_config = File.join(ENV['HOME'], '.hackbox/hackbox.yaml')
+
+ Settings.use :commandline, :config_file
+ Settings.define :namespace, :required => true
+ Settings.define :protocol, :required => true
+ Settings.define :coderoot, :required => true
+ Settings.define :targets, :default => 'catalog'
+ Settings.read(machine_config) if File.exists? machine_config
+ Settings.read(install_config) if File.exists? install_config
+ Settings.resolve!
+
+ # Hackbox directories to be created
+ coderoot = Settings[:coderoot]
+ hackbox = File.join(coderoot, Settings[:namespace].gsub(/\./,'/'), Settings[:protocol])
+ engine = File.join(hackbox, 'engine')
+ config = File.join(hackbox, 'config')
+
+ # Define idempotent directory tasks
+ [ coderoot, hackbox, engine, config ].each { |dir| directory dir }
+
+ # Hackbox files to be created
+ rakefile = File.join(hackbox, 'Rakefile')
+ main = File.join(engine, 'main')
+ config_yml = File.join(config, 'config.yaml')
+ icss_yml = File.join(config, "#{Settings[:protocol]}.icss.yaml")
+ endpoint = File.join(engine, "#{Settings[:protocol]}_endpoint.rb")
+ templates = File.join(hb_lib_dir, 'lib/hackboxen/template')
+
+ # Create a basic endpoint if apeyeye was specified as a target
+ file endpoint, [:config] => engine do |t, args|
+   HackBoxen::Template.new(File.join(templates, "endpoint.rb.erb"), endpoint, args[:config]).substitute!
+ end
+
+ # Create a basic hackbox Rakefile
+ file rakefile => hackbox do
+   HackBoxen::Template.new(File.join(templates, "Rakefile.erb"), rakefile, {}).substitute!
+ end
+
+ # Create a basic executable hackbox main file
+ file main => engine do
+   HackBoxen::Template.new(File.join(templates, 'main.erb'), main, {}).substitute!
+   File.chmod(0755, main)
+ end
+
+ # Create a basic config file
+ file config_yml => config do
+   basic_config = { 'namespace' => Settings[:namespace], 'protocol' => Settings[:protocol] }
+   HackBoxen::Template.new(File.join(templates, "config.yaml.erb"), config_yml, basic_config).substitute!
+ end
+
+ # Create a basic icss file
+ file icss_yml => config do
+   targets = Settings[:targets].split(',')
+   basic_config = {
+     'namespace' => Settings[:namespace],
+     'protocol' => Settings[:protocol],
+     'targets' => targets
+   }
+   HackBoxen::Template.new(File.join(templates, "icss.yaml.erb"), icss_yml, basic_config).substitute!
+   Rake::Task[endpoint].invoke(basic_config) if targets.include? 'apeyeye'
+ end
+
+ task :scaffold => [rakefile, main, config_yml, icss_yml]
+
+
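For orientation, here is what the scaffold above would lay down for a hypothetical configuration; the coderoot, namespace and protocol values below are illustrative, not defaults shipped with the gem:

<pre><code>
# Hypothetical settings, read from /etc/hackbox/hackbox.yaml, ~/.hackbox/hackbox.yaml,
# or Configliere's --namespace / --protocol / --coderoot / --targets command-line flags:
#   coderoot:  /data/code
#   namespace: foo.bar
#   protocol:  baz
#   targets:   catalog,apeyeye
#
# Running the :scaffold task with those settings would create:
#   /data/code/foo/bar/baz/                        (hackbox root)
#   /data/code/foo/bar/baz/Rakefile                (from Rakefile.erb)
#   /data/code/foo/bar/baz/config/config.yaml      (from config.yaml.erb)
#   /data/code/foo/bar/baz/config/baz.icss.yaml    (from icss.yaml.erb)
#   /data/code/foo/bar/baz/engine/main             (from main.erb, chmod 0755)
#   /data/code/foo/bar/baz/engine/baz_endpoint.rb  (from endpoint.rb.erb; only generated when 'apeyeye' is among the targets)
</code></pre>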
@@ -0,0 +1,36 @@
+ require 'erubis'
+
+ module HackBoxen
+   class Template
+
+     attr_accessor :source_template, :output_path, :attributes
+
+     def initialize source_template, output_path, attributes
+       @source_template = source_template
+       @output_path = output_path
+       @attributes = attributes
+     end
+
+     def compile!
+       dest << Erubis::Eruby.new(source).result(attributes)
+       dest << "\n"
+       dest
+     end
+
+     def substitute!
+       compile!
+     end
+
+     protected
+
+     def source
+       File.open(source_template).read
+     end
+
+     def dest
+       return @dest if @dest
+       @dest ||= File.open(output_path, 'w')
+     end
+
+   end
+ end
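A minimal usage sketch of the template helper above, assuming the class is reachable on the load path (the require path and file names here are hypothetical). The attributes hash is handed to Erubis, which exposes its keys as local variables inside the ERB source; that is why the templates that follow can refer to bare `namespace`, `protocol` and `targets`:

<pre><code>
require 'hackboxen/template'   # assumed require path for the class shown above

tpl = HackBoxen::Template.new('/tmp/greeting.erb',   # hypothetical ERB source
                              '/tmp/greeting.txt',   # hypothetical output path
                              'namespace' => 'foo.bar', 'protocol' => 'baz')
tpl.substitute!   # renders with Erubis and writes the result (plus a trailing newline) to the output path
</code></pre>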
@@ -0,0 +1,31 @@
+ require 'hackboxen'
+ #
+ # When you require 'hackboxen' the library establishes where the current hackbox directory
+ # is located and loads all required tasks in order for your hackbox to run to completion
+ #
+
+ task :get_data do
+   #
+   # This task is intended to pull data down from a source. Examples include
+   # the web, an ftp server, and Amazon's simple storage service (s3). As much
+   # as possible this should be the only task that interacts with the 'outside'
+   # world.
+   #
+ end
+
+ task :default => ['hb:create_working_config', 'hb:icss', 'hb:endpoint', :get_data, 'hb:init']
+ #
+ # hb:create_working_config establishes all required directories and serializes all
+ # configuration options out to env/working_config.json. This task is required.
+ #
+ # hb:icss copies the icss.yaml file, if it exists, into its proper place in fixd/data. This
+ # task is not required.
+ #
+ # hb:endpoint copies the endpoint.rb file, if it exists, into its proper place in fixd/code.
+ # This task is not required.
+ #
+ # :get_data is explained above. This task (and any other dependent tasks you wish to write) is
+ # expected only to pull data into the ripd directory, nothing more. This task is required.
+ #
+ # hb:init executes the main file located in engine. This task is required.
+ #
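As a concrete illustration of the :get_data contract described in the comments above, a hackbox could override the task along these lines. The source URL and the ripd path are hypothetical; in a real hackbox the ripd location is derived from the configured dataroot:

<pre><code>
require 'open-uri'
require 'fileutils'

# A minimal sketch: fetch one remote file into the hackbox's ripd/ directory and nothing else.
task :get_data do
  ripd = '/data/ripd/foo/bar/baz'                       # hypothetical ripd directory for this hackbox
  FileUtils.mkdir_p ripd
  File.open(File.join(ripd, 'raw_data.csv'), 'wb') do |f|
    f << open('http://example.com/raw_data.csv').read   # open-uri; the 'outside world' interaction lives here
  end
end
</code></pre>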
@@ -0,0 +1,10 @@
+ ---
+ #
+ # This is a sample config. Any hackbox-specific options or parameters that need to be accessed
+ # during the execution of a hackbox should be put in here.
+ #
+ namespace: <%= namespace %>
+ protocol: <%= protocol %>
+ filesystem_scheme: file
+ under_consideration: true # This flag is set to true for initial publishing, then removed when fully complete
+ update_frequency: monthly # How often the data is refreshed
@@ -0,0 +1,39 @@
+ <% format = "" %>
+ <% indent = 0 %>
+ <% entries = namespace.split('.') << protocol %>
+ <% entries.each_with_index do |part,count| %>
+ <% indent = count %>
+ <% indent.times { |c| format += "  " } %>
+ <% if entries[count] == entries.last %>
+ <% format += "class #{part.split("_").map { |p| p.capitalize }.join("")}Endpoint < Endpoint\n\n" %>
+ <% else %>
+ <% format += "module #{part.split("_").map { |p| p.capitalize }.join("")}\n" %>
+ <% end %>
+ <% end %>
+ <% indent += 1 %>
+ <% targets.each do |target| %>
+ <% case target %>
+ <% when 'mysql' %>
+ <% indent.times { |c| format += "  " } %>
+ <% format += "extend Connection::MysqlConnection\n" %>
+ <% when 'hbase' %>
+ <% indent.times { |c| format += "  " } %>
+ <% format += "extend Connection::HbaseConnection\n" %>
+ <% when 'geo_index' %>
+ <% indent.times { |c| format += "  " } %>
+ <% format += "extend Connection::HbaseGeoConnection\n" %>
+ <% when 'elasticsearch' %>
+ <% indent.times { |c| format += "  " } %>
+ <% format += "extend Connection::ElasticSearchConnection\n" %>
+ <% end %>
+ <% end %>
+ <% format += "\n" %>
+ <% indent.times { |c| format += "  " } %>
+ <% format += "Put your endpoint code here:\n\n" %>
+ <% indent -= 1 %>
+ <% while indent >= 0 %>
+ <% indent.times { |c| format += "  " } %>
+ <% format += "end\n" %>
+ <% indent -= 1 %>
+ <% end %>
+ <%= format %>
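For reference, with a hypothetical namespace of foo.bar, protocol baz_qux and targets mysql, the template above renders the following skeleton (the "Put your endpoint code here:" line is a placeholder the author is expected to replace):

<pre><code>
module Foo
  module Bar
    class BazQuxEndpoint < Endpoint

      extend Connection::MysqlConnection

      Put your endpoint code here:

    end
  end
end
</code></pre>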
@@ -0,0 +1,125 @@
+ ---
+ namespace: <%= namespace %>
+ protocol: <%= protocol %>
+
+ data_asset:
+   - name: <%= protocol %>_data_asset
+     location: <%= protocol %>_data.tsv
+     type: <%= protocol %>_data_record
+
+ <% if targets.include? 'apeyeye' %>
+ code_asset:
+   - name: <%= protocol %>_code_asset
+     location: code/<%= protocol %>_endpoint.rb
+
+ messages:
+   <%= protocol %>_search: # An example message name
+     request:
+       - name: <%= protocol %>_search_request
+         type: <%= protocol %>_search_request
+     response: <%= protocol %>_search_response_record
+     doc: A clear description of how to interact with the api using this message
+     samples:
+       - request: # A sample request using this message's defined request parameters below
+           - param_1_name: value
+             param_2_name: value
+             param_3_name: value
+
+ <% end %>
+ targets:
+ <% targets.each do |target| %>
+ <% case target %>
+ <% when 'catalog' %>
+   catalog:
+     - name: <%= protocol %>_catalog_entry
+       title: The display title of this catalog entry
+       description: |-
+         A very detailed description of the entry goes here. Ensure proper formatting and clear concise information about the dataset as this field will be the main visibility point of the dataset page.
+       tags:
+         - an
+         - array
+         - of
+         - single-word
+         - tags
+       packages: # You only need this if your dataset will be available for bulk download
+         - data_assets:
+             - <%= protocol %>_data_asset
+ <% if targets.include? 'apeyeye' %>
+           messages:
+             - an array of message names # needs to match the messages entries up above
+ <% end %>
+ <% when 'apeyeye' %>
+   apeyeye:
+     - code_assets:
+         - <%= protocol %>_code_asset
+ <% when 'hbase' %>
+   hbase:
+     # When your data has the following schema (row_key, column_family, column_name, column_value), use
+     - table_name: The hbase table to write data into
+       column_families: An array of column families to write data to
+       loader: fourple_loader
+       data_assets:
+         - <%= protocol %>_data_asset
+     # When your data is simply a tsv record, use these hashes instead
+     - table_name: The hbase table to write data into
+       column_family: A single column family to write data to
+       id_field: The name of the field to use as the row key when indexing
+       loader: tsv_loader
+       data_assets:
+         - <%= protocol %>_data_asset
+ <% when 'geo_index' %>
+   geo_index:
+     - table_name: The hbase table name # must be one of geo_location_infochimps_place, _path or _event
+       min_zoom: An integer specifying the minimum zoom level
+       max_zoom: An integer specifying the maximum zoom level
+       chars_per_page: An integer number of approximately how many characters to display per page
+       sort_field: The field within the Properties hash to sort by. Use -1 if no field is sorted by
+       data_assets:
+         - <%= protocol %>_data_asset
+ <% when 'elasticsearch' %>
+   elasticsearch:
+     - index_name: The name of the index to write data into
+       object_type: The object type to be created in ElasticSearch
+       id_field: Optionally used to define the field to id by during indexing
+       loader: Either tsv_loader or json_loader based on your data type
+       data_assets:
+         - <%= protocol %>_data_asset
+ <% when 'mysql' %>
+   mysql:
+     - database: The name of the MySQL database to be loaded into
+       table_name: The name of the corresponding table to be loaded into
+       data_assets:
+         - <%= protocol %>_data_asset
+ <% end %>
+ <% end %>
+
+ # Any non-basic types declared above must be defined explicitly under this type heading
+ types:
+   - name: <%= protocol %>_data_record
+     type: record
+     doc: Description of the <%= protocol %>_data_record type
+     fields:
+       - name: A name for one of the fields in the <%= protocol %>_data_record type
+         doc: A description for this field
+         type: If this is not a primitive type, make sure you explicitly define it below
+       - name: A name for one of the fields in the <%= protocol %>_data_record type
+         doc: A description for this field
+         type: If this is not a primitive type, make sure you explicitly define it below
+
+ <% if targets.include? 'apeyeye' %>
+   - name: <%= protocol %>_search_request
+     type: record
+     doc: Description of the <%= protocol %>_search_request type
+     fields:
+       - name: A name for one of the fields in the <%= protocol %>_search_request type
+         doc: A description for this field
+         type: If this is not a primitive type, make sure you explicitly define it below
+
+   - name: <%= protocol %>_search_response_record
+     type: record
+     doc: Description of the <%= protocol %>_search_response_record type
+     fields:
+       - name: A name for one of the fields in the <%= protocol %>_search_response_record type
+         doc: A description for this field
+         type: If this is not a primitive type, make sure you explicitly define it below
+ <% end %>
@@ -0,0 +1,31 @@
+ #!/usr/bin/env ruby
+ #
+ # A simple example of an executable main file. This script is NOT required to be ruby.
+ #
+
+ #
+ # inputdir is the first argument your main script will get. It will ALWAYS get this. inputdir
+ # will ALWAYS be a directory that contains ripd/, rawd/, fixd/, env/, and log/.
+ #
+ inputdir = ARGV[0]
+
+ #
+ # outputdir is the second argument your main script will get. It will ALWAYS get this. outputdir
+ # will ALWAYS be the fixd/data/ directory
+ #
+ outputdir = ARGV[1]
+
+ #
+ # Ruby example: read the working_environment.json file in env/ into a ruby hash
+ # (same as a javascript associative array, a java hashmap, a python dictionary, etc)
+ # called 'options' to access the configuration settings used to execute the Rakefile
+ #
+ require 'json'
+ options = JSON.parse(File.read(File.join(inputdir, "env", "working_environment.json")))
+
+ #
+ # If you require 'hackboxen' you can access the default path_to utility method
+ #
+ require 'hackboxen'
+ path_to :fixd_dir # => "[current_dataroot]/fixd/"
+ path_to :hb_engine # => "[current_hackbox]/engine/"
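Putting the pieces together, a complete (if trivial) main script under these conventions could look like the sketch below. The input filename and the csv-to-tsv munging are hypothetical stand-ins for whatever :get_data actually fetched into ripd/:

<pre><code>
#!/usr/bin/env ruby
require 'json'

inputdir, outputdir = ARGV[0], ARGV[1]

# Configuration serialized by hb:create_working_config (includes namespace, protocol, etc.)
options = JSON.parse(File.read(File.join(inputdir, 'env', 'working_environment.json')))

# Turn the (hypothetical) raw file pulled into ripd/ by :get_data into the tsv data asset
# that the generated icss points at, written into fixd/data/.
File.open(File.join(outputdir, "#{options['protocol']}_data.tsv"), 'w') do |out|
  File.foreach(File.join(inputdir, 'ripd', 'raw_data.csv')) do |line|
    out.puts line.chomp.split(',').join("\t")   # naive csv -> tsv, purely for illustration
  end
end
</code></pre>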
@@ -0,0 +1,49 @@
+ WorkingConfig = Configliere::Param.new
+ WorkingConfig.use :commandline, :config_file
+
+ module HackBoxen
+
+   autoload :ConfigValidator, 'hackboxen/utils/config_validator'
+   autoload :Paths, 'hackboxen/utils/paths'
+   autoload :Logging, 'hackboxen/utils/logging'
+
+   def self.find_root_dir
+     start_dir = File.dirname INCLUDING_FILE
+     Dir.chdir start_dir
+     until hackbox_root? Dir.pwd
+       Dir.chdir('..')
+       if Dir.pwd == '/'
+         puts "Warning: not in a Hackbox base directory"
+         return start_dir
+       end
+     end
+     return Dir.pwd
+   end
+
+   def self.hackbox_root? dir = Dir.pwd
+     %w[ engine config Rakefile ].each do |expected|
+       return false unless Dir.entries(dir).include? expected
+     end
+     true
+   end
+
+   def self.current_fs
+     fs = WorkingConfig[:filesystem_scheme] ? WorkingConfig[:filesystem_scheme] : 'file'
+     Swineherd::FileSystem.get fs
+   end
+
+   def self.current
+     hackbox_root? ? File.join(WorkingConfig[:namespace].gsub('.', '/'), WorkingConfig[:protocol]) : 'debug'
+   end
+
+   def self.verify_dependencies
+     %w[ dataroot namespace protocol ].each do |req|
+       raise "Your hackbox config appears to be missing a [#{req}]" unless WorkingConfig[req.to_sym]
+     end
+   end
+
+   def self.read_config cfg
+     WorkingConfig.read cfg if current_fs.exists?(cfg)
+   end
+ end
+
@@ -0,0 +1,63 @@
+ h1. Execution Environment Validator
+
+ Hackboxen usually require resources in their execution environment. If the @WorkingConfig@ for a hackbox contains the key @requires@, then its value must be a hash that declares its requirements. This declaration takes the form of a tree of hashes where each terminal key specifies a particular requirement and the value associated with that key is a configuration specifier for that requirement.
+
+ h2. Requirement Values
+
+ The value for each key may be one of:
+
+ * **Null:** This requirement must exist, but its exact configuration does not need to be precisely stated.
+ * **String:** This requirement must exist and its configuration (e.g. version constraint, location) is specified in the string.
+ * **Array Of Strings:** This requirement has multiple configuration constraints (e.g. min/max version, access to multiple mysql databases).
+ * **Hash:** The key is a category rather than an actual requirement. The value contains actual requirements or subcategories.
+
+ The meaning of a string value is defined by its key. In general, version strings should be specified in Bundler Gemfile syntax. Currently, the evaluator does not actually interpret value strings; it only checks for the existence of keys. However, these values may be needed by external tools or systems and so should be specified if a value other than the default is required.
+
+ h2. Schema
+
+ The following is the current schema for the top of the @requires@ tree (default versions in parentheses):
+
+ * **platform:** The processing environment for this hackbox
+ ** **os:** One of "linux", "osx", "win". ("linux")
+ ** **hardware:** One of "x86", "x86_64" ("x86")
+ * **language:** Languages and/or their libraries
+ ** **ruby:** The @RVM@ ruby version needed by this hackbox ("1.8.7")
+ ** **jars:** If the ruby version is a jruby version, then the needed external jars should be named in this hash.
+ ** **python:** The minimum python version needed by this hackbox ("2.6")
+ * **processing:** These are data processing tools and resources that need to be available
+ ** **pig:** Apache Pig is installed and configured.
+ ** **wukong:** Wukong Hadoop streaming processor is installed and configured.
+ * **shelltools:** A reference to command line tools that must be callable via a shell in the default @PATH@.
+ * **datastore:** Datastores that must be accessible by this hackbox. If the value for a datastore is @null@, then the default store is needed. If the value is a string, then this is the "name" of the required store. If the value is an array of strings, then all of the named stores are required.
+ ** **mysql:**
+ ** **elasticsearch:**
+ ** **hbase:**
+ * **filesystems:** These are the non-local filesystems that the hackbox needs and will access through the filesystem abstraction in swineherd. Local filesystems are always expected to be available.
+ ** **hdfs:**
+ ** **s3:**
+
+ h2. Example
+
+ An example YAML @requires@ specification should look something like:
+
+ <pre><code>
+ requires:
+   language:
+     ruby: 1.9
+     python: 2.6
+     jars:
+       xerces: 4.5
+   shelltools:
+     wget: null
+     curl: null
+     tar: null
+     gcc: null
+   datastore:
+     mysql: null
+     hbase: null
+ </code></pre>
+
+ h2. Evaluation
+
+ To be implemented.
+
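Until then, the checking behavior described above (terminal keys are verified for existence only; value strings are not interpreted) amounts to a simple walk over the @requires@ tree. A minimal illustrative sketch follows; it is not the gem's eventual @ConfigValidator@, and the @available@ list is a hypothetical description of what the environment offers:

<pre><code>
# Illustrative only: report terminal `requires` keys that the environment does not provide.
def missing_requirements(requires, available, trail = [])
  requires.inject([]) do |missing, (key, value)|
    path = (trail + [key.to_s]).join('.')
    if value.is_a?(Hash)                                                 # category: recurse into its children
      missing + missing_requirements(value, available, trail + [key.to_s])
    else                                                                 # terminal key: only presence is checked
      available.include?(path) ? missing : missing << path
    end
  end
end

available = %w[ language.ruby shelltools.wget datastore.mysql ]
requires  = { 'language' => { 'ruby' => '1.9' }, 'shelltools' => { 'wget' => nil, 'curl' => nil } }
missing_requirements(requires, available)   # => ["shelltools.curl"]
</code></pre>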