data_hut 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -1
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.yardopts +8 -0
- data/CHANGELOG.md +5 -1
- data/README.md +6 -0
- data/lib/data_hut/data_warehouse.rb +9 -1
- data/lib/data_hut/version.rb +1 -1
- data/samples/common/report.html.haml +1 -1
- data/samples/common/samples.gemfile +2 -1
- data/samples/league_of_legends.rb +6 -0
- data/samples/lol_lore_relationships.rb +101 -0
- data/samples/weather_station.rb +3 -4
- data/test/spec/basic_test.rb +43 -0
- metadata +7 -4
- data/.rvmrc +0 -1
data/.gitignore
CHANGED
data/.ruby-gemset
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
data_hut
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.9.3
|
data/.yardopts
ADDED
data/CHANGELOG.md
CHANGED
@@ -1,9 +1,13 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## 0.0.9
|
4
|
+
|
5
|
+
* added to_json support for dataset results.
|
6
|
+
|
3
7
|
## 0.0.8
|
4
8
|
|
5
9
|
* handle unsanitized nil values properly - If your input data has occasional nil values during extract or transform, you may have seen:
|
6
|
-
|
10
|
+
`DataHut: Ruby type 'NilClass' not supported by Sequel...`
|
7
11
|
DataHut now handles nil values instead of raising this exception so that it is easier to work with unsanitized datasets.
|
8
12
|
|
9
13
|
* added `DataHut::DataWarehouse#non_unique` which allows you to specify any test of uniqueness for early skipping during transform or extract phases. DataHut has duplicate detection built-in, i.e. it doesn't allow identical records to be inserted. However in the past, you had to wait for all the fields to be added or transformed before this detection was done. `non_unique` allows you to define more specific uniqueness parameters for early skipping without going through all that. For example, if you have a feed where you know a dup is identified by some kind of GUID, simply test whether the GUID is unique *before* going any further...
|
data/README.md
CHANGED
@@ -91,7 +91,13 @@ And results remain Sequel::Model objects, so you can access fields with object n
|
|
91
91
|
[34] pry(main)> record.age
|
92
92
|
=> 44
|
93
93
|
|
94
|
+
Or you can output results directly to JSON.
|
94
95
|
|
96
|
+
[1] pry(main)> puts ds.group_and_count(:name).all.to_json
|
97
|
+
[{"name":"barney","count":3},{"name":"fred","count":1},{"name":"phil","count":2}]
|
98
|
+
|
99
|
+
(See [samples/weather_station.rb](https://github.com/coldnebo/data_hut/blob/master/samples/weather_station.rb) for an example of using JSON output to visualize data with d3.js.)
|
100
|
+
|
95
101
|
Read more about the [Sequel gem](http://sequel.rubyforge.org/) to determine what operations you can perform on a DataHut dataset.
|
96
102
|
|
97
103
|
## A More Ambitious Example...
|
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'sequel'
|
2
2
|
require 'ostruct'
|
3
3
|
require 'logger'
|
4
|
+
require 'json'
|
4
5
|
|
5
6
|
module DataHut
|
6
7
|
|
@@ -58,9 +59,16 @@ module DataHut
|
|
58
59
|
# access the DataHut dataset. See {http://sequel.rubyforge.org/rdoc/classes/Sequel/Dataset.html Sequel::Dataset}
|
59
60
|
# for available operations on the dataset.
|
60
61
|
#
|
62
|
+
# @note the resulting [Sequel::Model] additionally supports a #to_json method for JSON export of the dataset results.
|
61
63
|
# @return [Sequel::Model] instance bound to the data warehouse. Use this handle to query and analyze the datahut.
|
62
64
|
def dataset
|
63
|
-
Class.new(Sequel::Model(@db[:data_warehouse]))
|
65
|
+
klass = Class.new(Sequel::Model(@db[:data_warehouse]))
|
66
|
+
klass.class_eval do
|
67
|
+
def to_json(*a)
|
68
|
+
values.to_json(*a)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
klass
|
64
72
|
end
|
65
73
|
|
66
74
|
# used to extract data from whatever source you wish. As long as the data forms an enumerable collection,
|
data/lib/data_hut/version.rb
CHANGED
@@ -23,13 +23,16 @@ unless File.exists?("lolstats.db")
|
|
23
23
|
|
24
24
|
# keep the powers for later since they are on different pages.
|
25
25
|
powers = {}
|
26
|
+
thumbnails = {}
|
26
27
|
champions_page.css('table.champion_item').each do |c|
|
27
28
|
name = c.css('td.description span.highlight a').text
|
28
29
|
attack = c.css('td.graphing td.filled_attack').count
|
29
30
|
health = c.css('td.graphing td.filled_health').count
|
30
31
|
spells = c.css('td.graphing td.filled_spells').count
|
31
32
|
difficulty = c.css('td.graphing td.filled_difficulty').count
|
33
|
+
thumbnail = c.css('td.champion a img/@src').to_s
|
32
34
|
powers.store(name, {attack_power: attack, defense_power: health, ability_power: spells, difficulty: difficulty})
|
35
|
+
thumbnails.store(name,thumbnail)
|
33
36
|
end
|
34
37
|
|
35
38
|
puts "loading champion data"
|
@@ -41,6 +44,7 @@ unless File.exists?("lolstats.db")
|
|
41
44
|
names = st.css('td.stats_name').collect{|e| e.text.strip.downcase.gsub(/ /,'_')}
|
42
45
|
values = st.css('td.stats_value').collect{|e| e.text.strip}
|
43
46
|
modifiers = st.css('td.stats_modifier').collect{|e| e.text.strip}
|
47
|
+
lore = champion_page.css('table.lore_table td.champion_description').text
|
44
48
|
|
45
49
|
# DataHut also allows you to store metadata for the data warehouse during any processing phase for later retrieval.
|
46
50
|
# Since we extract the data only once, but may need stats names for subsequent transforms, we can store the
|
@@ -61,6 +65,8 @@ unless File.exists?("lolstats.db")
|
|
61
65
|
r.defense_power = power[:defense_power]
|
62
66
|
r.ability_power = power[:ability_power]
|
63
67
|
r.difficulty = power[:difficulty]
|
68
|
+
r.lore = lore
|
69
|
+
r.thumbnail = thumbnails[r.name]
|
64
70
|
|
65
71
|
print "."
|
66
72
|
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
|
2
|
+
# run from the samples dir with:
|
3
|
+
# $ rake samples
|
4
|
+
# $ ruby league_of_legends.rb
|
5
|
+
# then
|
6
|
+
# $ ruby lol_lore_relationships.rb
|
7
|
+
|
8
|
+
require_relative 'common/sample_helper.rb'
|
9
|
+
|
10
|
+
require 'data_hut'
|
11
|
+
require 'pry'
|
12
|
+
require 'json'
|
13
|
+
require 'highline/import'
|
14
|
+
require 'nokogiri'
|
15
|
+
require 'open-uri'
|
16
|
+
|
17
|
+
# helper method to highlight and underline relations and places in the lore text
|
18
|
+
def highlight(text,relations,places)
|
19
|
+
highlight = String.new(text)
|
20
|
+
relations.each do |relation|
|
21
|
+
highlight.gsub!(/(#{relation["name"]})/) {"\033[7m#{$1}\033[0m"}
|
22
|
+
end
|
23
|
+
places.each do |place|
|
24
|
+
highlight.gsub!(/(#{place})/) {"\033[4m#{$1}\033[0m"}
|
25
|
+
end
|
26
|
+
|
27
|
+
highlight
|
28
|
+
end
|
29
|
+
|
30
|
+
|
31
|
+
|
32
|
+
raise "don't forget to run 'league_of_legends' sample first!" unless File.exists?("lolstats.db")
|
33
|
+
dh = DataHut.connect("lolstats")
|
34
|
+
ds = dh.dataset
|
35
|
+
|
36
|
+
# get the places of origin if they haven't already been loaded.
|
37
|
+
places_of_origin = dh.fetch_meta(:places_of_origin)
|
38
|
+
if places_of_origin.nil?
|
39
|
+
doc = Nokogiri::HTML(open("http://leagueoflegends.wikia.com/wiki/Category:Places"))
|
40
|
+
all_places = doc.css('div#mw-pages a').collect {|n| n.text}
|
41
|
+
doc = Nokogiri::HTML(open("http://leagueoflegends.wikia.com/wiki/Category:Fields_of_Justice"))
|
42
|
+
fields_of_justice = doc.css('div#mw-pages a').collect {|n| n.text}
|
43
|
+
places_of_origin = all_places - fields_of_justice - ["The League of Legends"]
|
44
|
+
dh.store_meta(:places_of_origin, places_of_origin)
|
45
|
+
end
|
46
|
+
|
47
|
+
# collect the champion names from the existing data.
|
48
|
+
names = ds.collect{|r|r.name}
|
49
|
+
|
50
|
+
# now, for each champion record in the data, add a set of relationships to other champions and a flag
|
51
|
+
# indicating whether these relationships have been reviewed or not.
|
52
|
+
dh.transform do |r|
|
53
|
+
# we'll search the single words and word pairs for the names (since some names have a space)
|
54
|
+
lore_words = r.lore.split(/\s+|\b/)
|
55
|
+
lore_pairs = []
|
56
|
+
lore_words.each_cons(2){|s| lore_pairs.push s.join(' ')}
|
57
|
+
# for the champions with single names, try to match, no?
|
58
|
+
relations = names & lore_words
|
59
|
+
# now match any with spaces in their names by matching against pairings. (we'll get them this time!)
|
60
|
+
relations.concat((names & lore_pairs))
|
61
|
+
relations = relations.reject{|d| d == r.name} # don't include ourself in the relations if mentioned.
|
62
|
+
relations = relations.collect{|d| {name:d}}
|
63
|
+
# does this motivate storing blobs? No, and I'll tell you why: https://github.com/coldnebo/data_hut/wiki/not-everything-can-be-a-blob
|
64
|
+
r.relations = relations.to_json.to_s
|
65
|
+
r.reviewed_relations = false
|
66
|
+
end
|
67
|
+
|
68
|
+
# now grab all the non-empty relations and display them for consideration...
|
69
|
+
non_empty_relations = ds.reject{|r| r.relations == "[]"}
|
70
|
+
|
71
|
+
puts "current non-empty champion relations:"
|
72
|
+
non_empty_relations.each do |r|
|
73
|
+
puts "#{r.name}: #{r.relations}"
|
74
|
+
end
|
75
|
+
|
76
|
+
|
77
|
+
# identifying the relationships automatically is a little too complex even with AI, so
|
78
|
+
# instead, we'll opt for manual review...
|
79
|
+
non_empty_relations.each do |r|
|
80
|
+
next if r.reviewed_relations
|
81
|
+
relations = JSON.parse(r.relations)
|
82
|
+
puts "--------------------------------"
|
83
|
+
puts "Champion: #{r.name}"
|
84
|
+
puts "Lore: "
|
85
|
+
puts highlight(r.lore, relations, places_of_origin)
|
86
|
+
puts "\nBased on your reading of the lore above, how would you classify #{r.name}'s relationships?"
|
87
|
+
r.reviewed_relations = true
|
88
|
+
relations.each do |relation|
|
89
|
+
relation['type'] = ask( "#{relation['name']} is #{r.name}'s: " )
|
90
|
+
if relation['type'].empty?
|
91
|
+
r.reviewed_relations = false
|
92
|
+
end
|
93
|
+
end
|
94
|
+
r.relations = relations.to_json.to_s
|
95
|
+
r.save_changes
|
96
|
+
break unless agree("continue? (y|n)", true)
|
97
|
+
end
|
98
|
+
|
99
|
+
#binding.pry
|
100
|
+
|
101
|
+
puts "done."
|
data/samples/weather_station.rb
CHANGED
@@ -10,18 +10,17 @@ require 'nokogiri'
|
|
10
10
|
require 'open-uri'
|
11
11
|
require 'pry'
|
12
12
|
require 'haml'
|
13
|
-
require 'json'
|
14
13
|
|
15
14
|
|
16
15
|
def generate_report(ds)
|
17
16
|
@title = "Boston Weather Forecast"
|
18
17
|
@h1 = "Forecasted Temperatures Report for Boston, MA, USA"
|
19
|
-
@data = ds.order(:start_time).
|
18
|
+
@data = ds.order(:start_time).all.to_json
|
20
19
|
@css = File.read("weather_files/weather.css")
|
21
20
|
@js = File.read("weather_files/weather.js")
|
22
21
|
engine = Haml::Engine.new(File.read("common/report.html.haml"))
|
23
|
-
report_name = "weather_report.html"
|
24
|
-
|
22
|
+
report_name = "output/weather_report.html"
|
23
|
+
FileUtils.mkdir("output") unless Dir.exists?("output")
|
25
24
|
File.open(report_name, "w") do |f|
|
26
25
|
f.puts engine.render(self)
|
27
26
|
end
|
data/test/spec/basic_test.rb
CHANGED
@@ -262,5 +262,48 @@ describe DataHut do
|
|
262
262
|
|
263
263
|
end
|
264
264
|
|
265
|
+
|
266
|
+
describe "json export" do
|
267
|
+
def setup
|
268
|
+
@dh = DataHut.connect("foo")
|
269
|
+
|
270
|
+
# first data pull
|
271
|
+
data = [{name: "barney", age: 27},
|
272
|
+
{name: "barney", age: 17},
|
273
|
+
{name: "barney", age: 37},
|
274
|
+
{name: "phil", age: 35},
|
275
|
+
{name: "phil", age: 31},
|
276
|
+
{name: "fred", age: 44}]
|
277
|
+
|
278
|
+
@dh.extract(data) do |r, d|
|
279
|
+
r.name = d[:name]
|
280
|
+
r.age = d[:age]
|
281
|
+
end
|
282
|
+
end
|
283
|
+
|
284
|
+
it "should provide json export" do
|
285
|
+
json = @dh.dataset.all.to_json
|
286
|
+
|
287
|
+
# should be valid json
|
288
|
+
result = JSON.parse(json)
|
289
|
+
assert(Array, result.class)
|
290
|
+
assert(Hash, result.first.class)
|
291
|
+
assert({"dw_id"=>1, "dw_processed"=>false, "name"=>"barney", "age"=>27}, result.first)
|
292
|
+
end
|
293
|
+
|
294
|
+
it "should provide json for calcs" do
|
295
|
+
# this collection doesn't convert to json using the Sequel :json_serializer plugin
|
296
|
+
# so using default json instead. see lib/data_hut/data_warehouse.rb:67
|
297
|
+
json = @dh.dataset.group_and_count(:name).all.to_json
|
298
|
+
|
299
|
+
# should be valid json
|
300
|
+
result = JSON.parse(json)
|
301
|
+
|
302
|
+
assert(3,result.select{|r| r["name"] == "barney"}.first["count"])
|
303
|
+
assert(2,result.select{|r| r["name"] == "phil"}.first["count"])
|
304
|
+
assert(1,result.select{|r| r["name"] == "fred"}.first["count"])
|
305
|
+
end
|
306
|
+
end
|
307
|
+
|
265
308
|
end
|
266
309
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_hut
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.8
|
4
|
+
version: 0.0.9
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-10-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: sequel
|
@@ -131,7 +131,9 @@ extensions: []
|
|
131
131
|
extra_rdoc_files: []
|
132
132
|
files:
|
133
133
|
- .gitignore
|
134
|
-
- .rvmrc
|
134
|
+
- .ruby-gemset
|
135
|
+
- .ruby-version
|
136
|
+
- .yardopts
|
135
137
|
- CHANGELOG.md
|
136
138
|
- Gemfile
|
137
139
|
- LICENSE
|
@@ -148,6 +150,7 @@ files:
|
|
148
150
|
- samples/common/sample_helper.rb
|
149
151
|
- samples/common/samples.gemfile
|
150
152
|
- samples/league_of_legends.rb
|
153
|
+
- samples/lol_lore_relationships.rb
|
151
154
|
- samples/reddit_science.rb
|
152
155
|
- samples/weather_files/screenshot.png
|
153
156
|
- samples/weather_files/weather.css
|
@@ -176,7 +179,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
176
179
|
version: '0'
|
177
180
|
requirements: []
|
178
181
|
rubyforge_project:
|
179
|
-
rubygems_version: 1.8.
|
182
|
+
rubygems_version: 1.8.25
|
180
183
|
signing_key:
|
181
184
|
specification_version: 3
|
182
185
|
summary: Like a data warehouse, but smaller.
|
data/.rvmrc
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
rvm 1.9.3@data_hut --create
|