RubyGems - data_hut - Versions diffs - 0.0.8 → 0.0.9 - Mend

data_hut 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

data/.gitignore +1 -1
data/.ruby-gemset +1 -0
data/.ruby-version +1 -0
data/.yardopts +8 -0
data/CHANGELOG.md +5 -1
data/README.md +6 -0
data/lib/data_hut/data_warehouse.rb +9 -1
data/lib/data_hut/version.rb +1 -1
data/samples/common/report.html.haml +1 -1
data/samples/common/samples.gemfile +2 -1
data/samples/league_of_legends.rb +6 -0
data/samples/lol_lore_relationships.rb +101 -0
data/samples/weather_station.rb +3 -4
data/test/spec/basic_test.rb +43 -0
metadata +7 -4
data/.rvmrc +0 -1

data/.gitignore CHANGED Viewed

@@ -17,4 +17,4 @@ test/version_tmp
 tmp
 *.db
 samples/common/samples.gemfile.lock
-samples/weather_report.html
+samples/output

data/.ruby-gemset ADDED Viewed

	@@ -0,0 +1 @@
1	+ data_hut

data/.ruby-version ADDED Viewed

	@@ -0,0 +1 @@
1	+ 1.9.3

data/.yardopts ADDED Viewed

@@ -0,0 +1,8 @@
+--no-private
+--protected
+--title "DataHut Documentation"
+--markup markdown
+--readme README.md
+-
+LICENSE
+CHANGELOG.md

data/CHANGELOG.md CHANGED Viewed

@@ -1,9 +1,13 @@
 # Changelog
+## 0.0.9
+* added to_json support for dataset results.
 ## 0.0.8
 * handle unsanitized nil values properly - If your input data has occasional nil values during extract or transform, you may have seen:
-    DataHut: Ruby type 'NilClass' not supported by Sequel...
+        `DataHut: Ruby type 'NilClass' not supported by Sequel...`
   DataHut now handles nil values instead of raising this exception so that it is easier to work with unsanitized datasets.
 * added `DataHut::DataWarehouse#non_unique` which allows you to specify any test of uniqueness for early skipping during transform or extract phases.  DataHut has duplicate detection built-in, i.e. it doesn't allow identical records to be inserted.  However in the past, you had to wait for all the fields to be added or transformed before this detection was done.  `non_unique` allows you to define more specific uniqueness parameters for early skipping without going through all that.  e.g. you have a feed where you know a dup is some kind of GUID... simply test if the GUID is unique *before* going any further...

data/README.md CHANGED Viewed

@@ -91,7 +91,13 @@ And results remain Sequel::Model objects, so you can access fields with object n
     [34] pry(main)> record.age
     => 44
+Or you can output results directly to JSON.
+    [1] pry(main)> puts ds.group_and_count(:name).all.to_json
+    [{"name":"barney","count":3},{"name":"fred","count":1},{"name":"phil","count":2}]
+(See [samples/weather_station.rb](https://github.com/coldnebo/data_hut/blob/master/samples/weather_station.rb) for an example of using JSON output to visualize data with d3.js.)
 Read more about the [Sequel gem](http://sequel.rubyforge.org/) to determine what operations you can perform on a DataHut dataset.
 ## A More Ambitious Example...

data/lib/data_hut/data_warehouse.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 require 'sequel'
 require 'ostruct'
 require 'logger'
+require 'json'
 module DataHut
@@ -58,9 +59,16 @@ module DataHut
     # access the DataHut dataset. See {http://sequel.rubyforge.org/rdoc/classes/Sequel/Dataset.html Sequel::Dataset}
     # for available operations on the dataset.
     #
+    # @note the resulting [Sequel::Model] additionally supports a #to_json method for JSON export of the dataset results.
     # @return [Sequel::Model] instance bound to the data warehouse. Use this handle to query and analyze the datahut.
     def dataset
-      Class.new(Sequel::Model(@db[:data_warehouse]))
+      klass = Class.new(Sequel::Model(@db[:data_warehouse]))
+      klass.class_eval do
+        def to_json(*a)
+          values.to_json(*a)
+        end
+      end
+      klass
     end
     # used to extract data from whatever source you wish. As long as the data forms an enumerable collection,

data/lib/data_hut/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module DataHut
-  VERSION = "0.0.8"
+  VERSION = "0.0.9"
 end

data/samples/common/report.html.haml CHANGED Viewed

@@ -2,7 +2,7 @@
 %html
   %head
     %title= @title
-    %script{src: "common/d3.v3.min.js", type: "text/javascript"}
+    %script{src: "../common/d3.v3.min.js", type: "text/javascript"}
     :css
       #{@css}
   %body

data/samples/common/samples.gemfile CHANGED Viewed

@@ -1,4 +1,4 @@
-source :rubygems
+source 'https://rubygems.org'
 # these are gems used in the samples.
 # install with:
@@ -6,3 +6,4 @@ source :rubygems
 gem 'nokogiri'
 gem 'haml'
+gem 'highline'

data/samples/league_of_legends.rb CHANGED Viewed

@@ -23,13 +23,16 @@ unless File.exists?("lolstats.db")
   # keep the powers for later since they are on different pages.
   powers = {}
+  thumbnails = {}
   champions_page.css('table.champion_item').each do |c|
     name        = c.css('td.description span.highlight a').text
     attack      = c.css('td.graphing td.filled_attack').count
     health      = c.css('td.graphing td.filled_health').count
     spells      = c.css('td.graphing td.filled_spells').count
     difficulty  = c.css('td.graphing td.filled_difficulty').count
+    thumbnail   = c.css('td.champion a img/@src').to_s
     powers.store(name, {attack_power: attack, defense_power: health, ability_power: spells, difficulty: difficulty})
+    thumbnails.store(name,thumbnail)
   end
   puts "loading champion data"
@@ -41,6 +44,7 @@ unless File.exists?("lolstats.db")
     names = st.css('td.stats_name').collect{|e| e.text.strip.downcase.gsub(/ /,'_')}
     values = st.css('td.stats_value').collect{|e| e.text.strip}
     modifiers = st.css('td.stats_modifier').collect{|e| e.text.strip}
+    lore = champion_page.css('table.lore_table td.champion_description').text
     # DataHut also allows you to store metadata for the data warehouse during any processing phase for later retrieval.
     # Since we extract the data only once, but may need stats names for subsequent transforms, we can store the
@@ -61,6 +65,8 @@ unless File.exists?("lolstats.db")
     r.defense_power = power[:defense_power]
     r.ability_power = power[:ability_power]
     r.difficulty = power[:difficulty]
+    r.lore = lore
+    r.thumbnail = thumbnails[r.name]
     print "."
   end

data/samples/lol_lore_relationships.rb ADDED Viewed

@@ -0,0 +1,101 @@
+# run from the samples dir with:
+# $ rake samples
+# $ ruby league_of_legends.rb
+# then
+# $ ruby lol_lore_relationships.rb
+require_relative 'common/sample_helper.rb'
+require 'data_hut'
+require 'pry'
+require 'json'
+require 'highline/import'
+require 'nokogiri'
+require 'open-uri'
+# helper method to highlight and underline relations and places in the lore text
+def highlight(text,relations,places)
+  highlight = String.new(text)
+  relations.each do |relation|
+    highlight.gsub!(/(#{relation["name"]})/) {"\033[7m#{$1}\033[0m"}
+  end
+  places.each do |place|
+    highlight.gsub!(/(#{place})/) {"\033[4m#{$1}\033[0m"}
+  end
+  highlight
+end
+raise "don't forget to run 'league_of_legends' sample first!" unless File.exists?("lolstats.db")
+dh = DataHut.connect("lolstats")
+ds = dh.dataset
+# get the places of origin if they haven't already been loaded.
+places_of_origin = dh.fetch_meta(:places_of_origin)
+if places_of_origin.nil?
+  doc = Nokogiri::HTML(open("http://leagueoflegends.wikia.com/wiki/Category:Places"))
+  all_places = doc.css('div#mw-pages a').collect {|n| n.text}
+  doc = Nokogiri::HTML(open("http://leagueoflegends.wikia.com/wiki/Category:Fields_of_Justice"))
+  fields_of_justice = doc.css('div#mw-pages a').collect {|n| n.text}
+  places_of_origin = all_places - fields_of_justice - ["The League of Legends"]
+  dh.store_meta(:places_of_origin, places_of_origin)
+end
+# collect the champion names from the existing data.
+names = ds.collect{|r|r.name}
+# now, for each champion record in the data, add a set of relationships to other champions and a flag
+# indicating whether these relationships have been reviewed or not.
+dh.transform do |r|
+  # we'll search the single words and word pairs for the names (since some names have a space)
+  lore_words = r.lore.split(/\s+|\b/)
+  lore_pairs = []
+  lore_words.each_cons(2){|s| lore_pairs.push s.join(' ')}
+  # for the champions with single names, try to match, no?
+  relations = names & lore_words
+  # now match any with spaces in their names by matching against pairings. (we'll get them this time!)
+  relations.concat((names & lore_pairs))
+  relations = relations.reject{|d| d == r.name} # don't include ourself in the relations if mentioned.
+  relations = relations.collect{|d| {name:d}}
+  # does this motivate storing blobs?  No, and I'll tell you why: https://github.com/coldnebo/data_hut/wiki/not-everything-can-be-a-blob
+  r.relations = relations.to_json.to_s
+  r.reviewed_relations = false
+end
+# now grab all the non-empty relations and display them for consideration...
+non_empty_relations = ds.reject{|r| r.relations == "[]"}
+puts "current non-empty champion relations:"
+non_empty_relations.each do |r|
+  puts "#{r.name}: #{r.relations}"
+end
+# identifying the relationships automatically is a little too complex even with AI, so
+# instead, we'll opt for manual review...
+non_empty_relations.each do |r|
+  next if r.reviewed_relations
+  relations = JSON.parse(r.relations)
+  puts "--------------------------------"
+  puts "Champion: #{r.name}"
+  puts "Lore: "
+  puts highlight(r.lore, relations, places_of_origin)
+  puts "\nBased on your reading of the lore above, how would you classify #{r.name}'s relationships?"
+  r.reviewed_relations = true
+  relations.each do |relation|
+    relation['type'] = ask( "#{relation['name']} is #{r.name}'s: " )
+    if relation['type'].empty?
+      r.reviewed_relations = false
+    end
+  end
+  r.relations = relations.to_json.to_s
+  r.save_changes
+  break unless agree("continue? (y|n)", true)
+end
+#binding.pry
+puts "done."

data/samples/weather_station.rb CHANGED Viewed

@@ -10,18 +10,17 @@ require 'nokogiri'
 require 'open-uri'
 require 'pry'
 require 'haml'
-require 'json'
 def generate_report(ds)
   @title      = "Boston Weather Forecast"
   @h1         = "Forecasted Temperatures Report for Boston, MA, USA"
-  @data       = ds.order(:start_time).collect{|d| d.to_hash}.to_json
+  @data       = ds.order(:start_time).all.to_json
   @css        = File.read("weather_files/weather.css")
   @js         = File.read("weather_files/weather.js")
   engine      = Haml::Engine.new(File.read("common/report.html.haml"))
-  report_name = "weather_report.html"
+  report_name = "output/weather_report.html"
+  FileUtils.mkdir("output") unless Dir.exists?("output")
   File.open(report_name, "w") do |f|
     f.puts engine.render(self)
   end

data/test/spec/basic_test.rb CHANGED Viewed

@@ -262,5 +262,48 @@ describe DataHut do
   end
+  describe "json export" do
+    def setup
+      @dh = DataHut.connect("foo")
+      # first data pull
+      data = [{name: "barney", age: 27},
+              {name: "barney", age: 17},
+              {name: "barney", age: 37},
+              {name: "phil", age: 35},
+              {name: "phil", age: 31},
+              {name: "fred", age: 44}]
+      @dh.extract(data) do |r, d|
+        r.name = d[:name]
+        r.age = d[:age]
+      end
+    end
+    it "should provide json export" do
+      json = @dh.dataset.all.to_json
+      # should be valid json
+      result = JSON.parse(json)
+      assert(Array, result.class)
+      assert(Hash, result.first.class)
+      assert({"dw_id"=>1, "dw_processed"=>false, "name"=>"barney", "age"=>27}, result.first)
+    end
+    it "should provide json for calcs" do
+      # this collection doesn't convert to json using the Sequel :json_serializer plugin
+      # so using default json instead. see lib/data_hut/data_warehouse.rb:67
+      json = @dh.dataset.group_and_count(:name).all.to_json
+      # should be valid json
+      result = JSON.parse(json)
+      assert(3,result.select{|r| r["name"] == "barney"}.first["count"])
+      assert(2,result.select{|r| r["name"] == "phil"}.first["count"])
+      assert(1,result.select{|r| r["name"] == "fred"}.first["count"])
+    end
+  end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: data_hut
 version: !ruby/object:Gem::Version
-  version: 0.0.8
+  version: 0.0.9
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-02-25 00:00:00.000000000 Z
+date: 2013-10-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: sequel
@@ -131,7 +131,9 @@ extensions: []
 extra_rdoc_files: []
 files:
 - .gitignore
-- .rvmrc
+- .ruby-gemset
+- .ruby-version
+- .yardopts
 - CHANGELOG.md
 - Gemfile
 - LICENSE
@@ -148,6 +150,7 @@ files:
 - samples/common/sample_helper.rb
 - samples/common/samples.gemfile
 - samples/league_of_legends.rb
+- samples/lol_lore_relationships.rb
 - samples/reddit_science.rb
 - samples/weather_files/screenshot.png
 - samples/weather_files/weather.css
@@ -176,7 +179,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.24
+rubygems_version: 1.8.25
 signing_key:
 specification_version: 3
 summary: Like a data warehouse, but smaller.

data/.rvmrc DELETED Viewed

	@@ -1 +0,0 @@
1	- rvm 1.9.3@data_hut --create