data_hut 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -1
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.yardopts +8 -0
- data/CHANGELOG.md +5 -1
- data/README.md +6 -0
- data/lib/data_hut/data_warehouse.rb +9 -1
- data/lib/data_hut/version.rb +1 -1
- data/samples/common/report.html.haml +1 -1
- data/samples/common/samples.gemfile +2 -1
- data/samples/league_of_legends.rb +6 -0
- data/samples/lol_lore_relationships.rb +101 -0
- data/samples/weather_station.rb +3 -4
- data/test/spec/basic_test.rb +43 -0
- metadata +7 -4
- data/.rvmrc +0 -1
data/.gitignore
CHANGED
data/.ruby-gemset
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
data_hut
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.9.3
|
data/.yardopts
ADDED
data/CHANGELOG.md
CHANGED
@@ -1,9 +1,13 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## 0.0.9
|
4
|
+
|
5
|
+
* added to_json support for dataset results.
|
6
|
+
|
3
7
|
## 0.0.8
|
4
8
|
|
5
9
|
* handle unsanitized nil values properly - If your input data has occasional nil values during extract or transform, you may have seen:
|
6
|
-
|
10
|
+
`DataHut: Ruby type 'NilClass' not supported by Sequel...`
|
7
11
|
DataHut now handles nil values instead of raising this exception so that it is easier to work with unsanitized datasets.
|
8
12
|
|
9
13
|
* added `DataHut::DataWarehouse#non_unique` which allows you to specify any test of uniqueness for early skipping during transform or extract phases. DataHut has duplicate detection built-in, i.e. it doesn't allow identical records to be inserted. However in the past, you had to wait for all the fields to be added or transformed before this detection was done. `non_unique` allows you to define more specific uniqueness parameters for early skipping without going through all that, e.g. you have a feed where you know a dup is some kind of GUID... simply test if the GUID is unique *before* going any further...
|
data/README.md
CHANGED
@@ -91,7 +91,13 @@ And results remain Sequel::Model objects, so you can access fields with object n
|
|
91
91
|
[34] pry(main)> record.age
|
92
92
|
=> 44
|
93
93
|
|
94
|
+
Or you can output results directly to JSON.
|
94
95
|
|
96
|
+
[1] pry(main)> puts ds.group_and_count(:name).all.to_json
|
97
|
+
[{"name":"barney","count":3},{"name":"fred","count":1},{"name":"phil","count":2}]
|
98
|
+
|
99
|
+
(See [samples/weather_station.rb](https://github.com/coldnebo/data_hut/blob/master/samples/weather_station.rb) for an example of using JSON output to visualize data with d3.js.)
|
100
|
+
|
95
101
|
Read more about the [Sequel gem](http://sequel.rubyforge.org/) to determine what operations you can perform on a DataHut dataset.
|
96
102
|
|
97
103
|
## A More Ambitious Example...
|
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'sequel'
|
2
2
|
require 'ostruct'
|
3
3
|
require 'logger'
|
4
|
+
require 'json'
|
4
5
|
|
5
6
|
module DataHut
|
6
7
|
|
@@ -58,9 +59,16 @@ module DataHut
|
|
58
59
|
# access the DataHut dataset. See {http://sequel.rubyforge.org/rdoc/classes/Sequel/Dataset.html Sequel::Dataset}
|
59
60
|
# for available operations on the dataset.
|
60
61
|
#
|
62
|
+
# @note the resulting [Sequel::Model] additionally supports a #to_json method for JSON export of the dataset results.
|
61
63
|
# @return [Sequel::Model] instance bound to the data warehouse. Use this handle to query and analyze the datahut.
|
62
64
|
def dataset
|
63
|
-
Class.new(Sequel::Model(@db[:data_warehouse]))
|
65
|
+
klass = Class.new(Sequel::Model(@db[:data_warehouse]))
|
66
|
+
klass.class_eval do
|
67
|
+
def to_json(*a)
|
68
|
+
values.to_json(*a)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
klass
|
64
72
|
end
|
65
73
|
|
66
74
|
# used to extract data from whatever source you wish. As long as the data forms an enumerable collection,
|
data/lib/data_hut/version.rb
CHANGED
@@ -23,13 +23,16 @@ unless File.exists?("lolstats.db")
|
|
23
23
|
|
24
24
|
# keep the powers for later since they are on different pages.
|
25
25
|
powers = {}
|
26
|
+
thumbnails = {}
|
26
27
|
champions_page.css('table.champion_item').each do |c|
|
27
28
|
name = c.css('td.description span.highlight a').text
|
28
29
|
attack = c.css('td.graphing td.filled_attack').count
|
29
30
|
health = c.css('td.graphing td.filled_health').count
|
30
31
|
spells = c.css('td.graphing td.filled_spells').count
|
31
32
|
difficulty = c.css('td.graphing td.filled_difficulty').count
|
33
|
+
thumbnail = c.css('td.champion a img/@src').to_s
|
32
34
|
powers.store(name, {attack_power: attack, defense_power: health, ability_power: spells, difficulty: difficulty})
|
35
|
+
thumbnails.store(name,thumbnail)
|
33
36
|
end
|
34
37
|
|
35
38
|
puts "loading champion data"
|
@@ -41,6 +44,7 @@ unless File.exists?("lolstats.db")
|
|
41
44
|
names = st.css('td.stats_name').collect{|e| e.text.strip.downcase.gsub(/ /,'_')}
|
42
45
|
values = st.css('td.stats_value').collect{|e| e.text.strip}
|
43
46
|
modifiers = st.css('td.stats_modifier').collect{|e| e.text.strip}
|
47
|
+
lore = champion_page.css('table.lore_table td.champion_description').text
|
44
48
|
|
45
49
|
# DataHut also allows you to store metadata for the data warehouse during any processing phase for later retrieval.
|
46
50
|
# Since we extract the data only once, but may need stats names for subsequent transforms, we can store the
|
@@ -61,6 +65,8 @@ unless File.exists?("lolstats.db")
|
|
61
65
|
r.defense_power = power[:defense_power]
|
62
66
|
r.ability_power = power[:ability_power]
|
63
67
|
r.difficulty = power[:difficulty]
|
68
|
+
r.lore = lore
|
69
|
+
r.thumbnail = thumbnails[r.name]
|
64
70
|
|
65
71
|
print "."
|
66
72
|
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
|
2
|
+
# run from the samples dir with:
|
3
|
+
# $ rake samples
|
4
|
+
# $ ruby league_of_legends.rb
|
5
|
+
# then
|
6
|
+
# $ ruby lol_lore_relationships.rb
|
7
|
+
|
8
|
+
require_relative 'common/sample_helper.rb'
|
9
|
+
|
10
|
+
require 'data_hut'
|
11
|
+
require 'pry'
|
12
|
+
require 'json'
|
13
|
+
require 'highline/import'
|
14
|
+
require 'nokogiri'
|
15
|
+
require 'open-uri'
|
16
|
+
|
17
|
+
# helper method to highlight and underline relations and places in the lore text
|
18
|
+
def highlight(text,relations,places)
|
19
|
+
highlight = String.new(text)
|
20
|
+
relations.each do |relation|
|
21
|
+
highlight.gsub!(/(#{relation["name"]})/) {"\033[7m#{$1}\033[0m"}
|
22
|
+
end
|
23
|
+
places.each do |place|
|
24
|
+
highlight.gsub!(/(#{place})/) {"\033[4m#{$1}\033[0m"}
|
25
|
+
end
|
26
|
+
|
27
|
+
highlight
|
28
|
+
end
|
29
|
+
|
30
|
+
|
31
|
+
|
32
|
+
raise "don't forget to run 'league_of_legends' sample first!" unless File.exists?("lolstats.db")
|
33
|
+
dh = DataHut.connect("lolstats")
|
34
|
+
ds = dh.dataset
|
35
|
+
|
36
|
+
# get the places of origin if they haven't already been loaded.
|
37
|
+
places_of_origin = dh.fetch_meta(:places_of_origin)
|
38
|
+
if places_of_origin.nil?
|
39
|
+
doc = Nokogiri::HTML(open("http://leagueoflegends.wikia.com/wiki/Category:Places"))
|
40
|
+
all_places = doc.css('div#mw-pages a').collect {|n| n.text}
|
41
|
+
doc = Nokogiri::HTML(open("http://leagueoflegends.wikia.com/wiki/Category:Fields_of_Justice"))
|
42
|
+
fields_of_justice = doc.css('div#mw-pages a').collect {|n| n.text}
|
43
|
+
places_of_origin = all_places - fields_of_justice - ["The League of Legends"]
|
44
|
+
dh.store_meta(:places_of_origin, places_of_origin)
|
45
|
+
end
|
46
|
+
|
47
|
+
# collect the champion names from the existing data.
|
48
|
+
names = ds.collect{|r|r.name}
|
49
|
+
|
50
|
+
# now, for each champion record in the data, add a set of relationships to other champions and a flag
|
51
|
+
# indicating whether these relationships have been reviewed or not.
|
52
|
+
dh.transform do |r|
|
53
|
+
# we'll search the single words and word pairs for the names (since some names have a space)
|
54
|
+
lore_words = r.lore.split(/\s+|\b/)
|
55
|
+
lore_pairs = []
|
56
|
+
lore_words.each_cons(2){|s| lore_pairs.push s.join(' ')}
|
57
|
+
# for the champions with single names, try to match, no?
|
58
|
+
relations = names & lore_words
|
59
|
+
# now match any with spaces in their names by matching against pairings. (we'll get them this time!)
|
60
|
+
relations.concat((names & lore_pairs))
|
61
|
+
relations = relations.reject{|d| d == r.name} # don't include ourself in the relations if mentioned.
|
62
|
+
relations = relations.collect{|d| {name:d}}
|
63
|
+
# does this motivate storing blobs? No, and I'll tell you why: https://github.com/coldnebo/data_hut/wiki/not-everything-can-be-a-blob
|
64
|
+
r.relations = relations.to_json.to_s
|
65
|
+
r.reviewed_relations = false
|
66
|
+
end
|
67
|
+
|
68
|
+
# now grab all the non-empty relations and display them for consideration...
|
69
|
+
non_empty_relations = ds.reject{|r| r.relations == "[]"}
|
70
|
+
|
71
|
+
puts "current non-empty champion relations:"
|
72
|
+
non_empty_relations.each do |r|
|
73
|
+
puts "#{r.name}: #{r.relations}"
|
74
|
+
end
|
75
|
+
|
76
|
+
|
77
|
+
# identifying the relationships automatically is a little too complex even with AI, so
|
78
|
+
# instead, we'll opt for manual review...
|
79
|
+
non_empty_relations.each do |r|
|
80
|
+
next if r.reviewed_relations
|
81
|
+
relations = JSON.parse(r.relations)
|
82
|
+
puts "--------------------------------"
|
83
|
+
puts "Champion: #{r.name}"
|
84
|
+
puts "Lore: "
|
85
|
+
puts highlight(r.lore, relations, places_of_origin)
|
86
|
+
puts "\nBased on your reading of the lore above, how would you classify #{r.name}'s relationships?"
|
87
|
+
r.reviewed_relations = true
|
88
|
+
relations.each do |relation|
|
89
|
+
relation['type'] = ask( "#{relation['name']} is #{r.name}'s: " )
|
90
|
+
if relation['type'].empty?
|
91
|
+
r.reviewed_relations = false
|
92
|
+
end
|
93
|
+
end
|
94
|
+
r.relations = relations.to_json.to_s
|
95
|
+
r.save_changes
|
96
|
+
break unless agree("continue? (y|n)", true)
|
97
|
+
end
|
98
|
+
|
99
|
+
#binding.pry
|
100
|
+
|
101
|
+
puts "done."
|
data/samples/weather_station.rb
CHANGED
@@ -10,18 +10,17 @@ require 'nokogiri'
|
|
10
10
|
require 'open-uri'
|
11
11
|
require 'pry'
|
12
12
|
require 'haml'
|
13
|
-
require 'json'
|
14
13
|
|
15
14
|
|
16
15
|
def generate_report(ds)
|
17
16
|
@title = "Boston Weather Forecast"
|
18
17
|
@h1 = "Forecasted Temperatures Report for Boston, MA, USA"
|
19
|
-
@data = ds.order(:start_time).
|
18
|
+
@data = ds.order(:start_time).all.to_json
|
20
19
|
@css = File.read("weather_files/weather.css")
|
21
20
|
@js = File.read("weather_files/weather.js")
|
22
21
|
engine = Haml::Engine.new(File.read("common/report.html.haml"))
|
23
|
-
report_name = "weather_report.html"
|
24
|
-
|
22
|
+
report_name = "output/weather_report.html"
|
23
|
+
FileUtils.mkdir("output") unless Dir.exists?("output")
|
25
24
|
File.open(report_name, "w") do |f|
|
26
25
|
f.puts engine.render(self)
|
27
26
|
end
|
data/test/spec/basic_test.rb
CHANGED
@@ -262,5 +262,48 @@ describe DataHut do
|
|
262
262
|
|
263
263
|
end
|
264
264
|
|
265
|
+
|
266
|
+
describe "json export" do
|
267
|
+
def setup
|
268
|
+
@dh = DataHut.connect("foo")
|
269
|
+
|
270
|
+
# first data pull
|
271
|
+
data = [{name: "barney", age: 27},
|
272
|
+
{name: "barney", age: 17},
|
273
|
+
{name: "barney", age: 37},
|
274
|
+
{name: "phil", age: 35},
|
275
|
+
{name: "phil", age: 31},
|
276
|
+
{name: "fred", age: 44}]
|
277
|
+
|
278
|
+
@dh.extract(data) do |r, d|
|
279
|
+
r.name = d[:name]
|
280
|
+
r.age = d[:age]
|
281
|
+
end
|
282
|
+
end
|
283
|
+
|
284
|
+
it "should provide json export" do
|
285
|
+
json = @dh.dataset.all.to_json
|
286
|
+
|
287
|
+
# should be valid json
|
288
|
+
result = JSON.parse(json)
|
289
|
+
assert(Array, result.class)
|
290
|
+
assert(Hash, result.first.class)
|
291
|
+
assert({"dw_id"=>1, "dw_processed"=>false, "name"=>"barney", "age"=>27}, result.first)
|
292
|
+
end
|
293
|
+
|
294
|
+
it "should provide json for calcs" do
|
295
|
+
# this collection doesn't convert to json using the Sequel :json_serializer plugin
|
296
|
+
# so using default json instead. see lib/data_hut/data_warehouse.rb:67
|
297
|
+
json = @dh.dataset.group_and_count(:name).all.to_json
|
298
|
+
|
299
|
+
# should be valid json
|
300
|
+
result = JSON.parse(json)
|
301
|
+
|
302
|
+
assert(3,result.select{|r| r["name"] == "barney"}.first["count"])
|
303
|
+
assert(2,result.select{|r| r["name"] == "phil"}.first["count"])
|
304
|
+
assert(1,result.select{|r| r["name"] == "fred"}.first["count"])
|
305
|
+
end
|
306
|
+
end
|
307
|
+
|
265
308
|
end
|
266
309
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_hut
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.8
|
4
|
+
version: 0.0.9
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-10-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: sequel
|
@@ -131,7 +131,9 @@ extensions: []
|
|
131
131
|
extra_rdoc_files: []
|
132
132
|
files:
|
133
133
|
- .gitignore
|
134
|
-
- .rvmrc
|
134
|
+
- .ruby-gemset
|
135
|
+
- .ruby-version
|
136
|
+
- .yardopts
|
135
137
|
- CHANGELOG.md
|
136
138
|
- Gemfile
|
137
139
|
- LICENSE
|
@@ -148,6 +150,7 @@ files:
|
|
148
150
|
- samples/common/sample_helper.rb
|
149
151
|
- samples/common/samples.gemfile
|
150
152
|
- samples/league_of_legends.rb
|
153
|
+
- samples/lol_lore_relationships.rb
|
151
154
|
- samples/reddit_science.rb
|
152
155
|
- samples/weather_files/screenshot.png
|
153
156
|
- samples/weather_files/weather.css
|
@@ -176,7 +179,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
176
179
|
version: '0'
|
177
180
|
requirements: []
|
178
181
|
rubyforge_project:
|
179
|
-
rubygems_version: 1.8.
|
182
|
+
rubygems_version: 1.8.25
|
180
183
|
signing_key:
|
181
184
|
specification_version: 3
|
182
185
|
summary: Like a data warehouse, but smaller.
|
data/.rvmrc
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
rvm 1.9.3@data_hut --create
|