hirsute 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,108 @@
1
+ # various utility methods
2
+ require 'lib/histoparse.rb'
3
+
4
+ module Hirsute
5
+ module Support
6
+
7
+ include Hirsute::HistoParse
8
+
9
+ # store a map of range objects to its constituent array. But we want to store it within the Module's eigenclass so it's shared
10
+ # across the code base
11
+ Hirsute::Support.instance_eval {@rangeToArray = Hash.new}
12
+
13
# Look up the class constant that belongs to an object-type name.
# Recipes for object types (e.g., a('thing')) register a class (Thing) under the
# capitalized type name; this maps the String we usually have on hand while
# working with a template back to that class constant.
# Note that String#capitalize also lower-cases the rest of the name, so
# 'bottleCellar' is looked up as Bottlecellar.
def class_for_name(className)
  constant_name = className.capitalize
  Kernel.const_get(constant_name)
end
18
+
19
+
20
+ # given an array of probabilities, return an integer (between 0 and length of probabilities) based on the probabilities passed in
21
+ # in other words, [.9,.05,.05] would return 0 approximately 90% of the time.
22
+ def integer_from_histogram(probabilities)
23
+
24
+ sum = 0
25
+ probabilities.each {|probability| sum = sum + probability}
26
+
27
+ high_end = sum
28
+ random_value = rand * high_end
29
+
30
+ final_idx = 0
31
+ ret_val = probabilities.each_index do |idx|
32
+ cur_prob = probabilities[idx]
33
+
34
+ if random_value <= high_end && random_value > high_end - cur_prob
35
+ final_idx = idx
36
+ break
37
+ else
38
+ high_end = high_end - cur_prob
39
+ next
40
+ end
41
+ end
42
+ final_idx
43
+ end
44
+
45
# Pick a random item from list, optionally weighted by a histogram.
#
# list          - the items to choose from (an Array, or any object that offers
#                 length, [] and either choice or sample)
# probabilities - nil or an empty array for a uniform pick, an array of weights
#                 (one per item, in order), or a String holding an ASCII-art
#                 histogram that is parsed with HistoParse
#
# Raises a RuntimeError if there are more weights than items. Prints a warning
# if there are fewer weights than items, because the trailing items can then
# never be selected.
def random_item_with_histogram(list, probabilities)
  # A String must be parsed before anything else, so that the length checks below
  # compare bucket counts rather than the number of characters in the string.
  if probabilities.kind_of?(String)
    return random_item_with_histogram(list, parse_histogram(probabilities).histogram_buckets)
  end

  # No histogram at all means a uniform pick. Array#choice only exists on Ruby 1.8.7
  # (and on objects that define it themselves); newer Arrays offer sample instead.
  if probabilities.nil? || probabilities.empty?
    return list.respond_to?(:choice) ? list.choice : list.sample
  end

  raise "#{list.inspect} needs to have as many items as #{probabilities.inspect}" if list.length < probabilities.length
  puts "Warning: #{list.inspect} has more items than #{probabilities.inspect} has items; some items will never be selected." if probabilities.length < list.length

  list[integer_from_histogram(probabilities)]
end
54
+
55
# True when obj is a Hirsute::Template (or an instance of a subclass of it).
def is_template(obj)
  obj.is_a?(Hirsute::Template)
end
58
+
59
# Shared logic for creating and registering a template together with its class type.
#
# objName - the name of the object type, e.g. 'thing'
# block   - optional block, evaluated with the new template as self so that it can
#           call the methods defined in Template
#
# Returns the new Hirsute::Template.
def make_template(objName, &block)
  # Register a class named after the type (a('thing') yields Thing) as a subclass of
  # Hirsute::Fixed, so class instance variables can be stored on it and users see a
  # readable name. This has to happen before the block runs, because
  # template.instance_eval will add to this class if an is_stored_in call is used.
  Kernel.const_set(objName.capitalize.to_sym, Class.new(Hirsute::Fixed))

  template = Hirsute::Template.new(objName)
  template.instance_eval(&block) if block_given?
  template
end
78
+
79
# Shared logic for turning a field value into a generator.
# A value that already is a Generator is handed back untouched; anything else is
# wrapped in a LiteralGenerator together with the (optional) block.
def generator_from_value(value, &block)
  return value if value.is_a?(Generator)

  LiteralGenerator.new(value, block)
end
87
+
88
# Given a range object, select an item randomly from it.
# The range is expanded through Hirsute::Support.get_range_array, which caches
# range -> range.to_a for speed. That helper is defined on the Support module
# itself, so it has to be called with the module as the explicit receiver.
#
# Returns nil when the range is empty.
def random_from_range(range)
  ary = Hirsute::Support.get_range_array(range)
  ary[rand(ary.length)]
end
92
+
93
# Returns the array form of the given range, computing range.to_a and storing it
# in the @rangeToArray cache the first time a range is seen.
# Kept as its own method so it can be unit tested.
# Defined on the Support module itself, so @rangeToArray here is the module-level
# instance variable.
def Support.get_range_array(range)
  @rangeToArray[range] ||= range.to_a
end
104
+ end
105
+
106
+
107
+
108
+ end
@@ -0,0 +1,47 @@
1
# A mini-DSL within Hirsute that can parse ASCII-art histograms for use in Hirsute
# For instance, it could translate this
#   |****
#   |**
#   |*****
# into a histogram array of (approximately)
#   [0.36,0.18,0.45]
module Hirsute
  module HistoParse

    # Parse the given histogram string and return the resulting ParsedHistogram.
    def parse_histogram(histogram_string)
      ParsedHistogram.new(histogram_string)
    end

    # encapsulates the information about a parsed histogram
    class ParsedHistogram
      # Array of Floats, one per histogram line, each being that line's share of
      # all the stars counted (so the buckets add up to roughly 1.0)
      attr_reader :histogram_buckets

      # histogram_string - text with one histogram bar per line. Only the first run
      # of * characters on a line is counted; lines without any * are ignored, so
      # comments, axes and blank lines do not affect the result.
      def initialize(histogram_string)
        line_regex = /(\*+)/

        # number of stars on each line that carries histogram data, in order
        stars_per_line = histogram_string.split("\n").map { |line| line[line_regex, 1] }.compact.map { |stars| stars.length }

        total_stars = stars_per_line.inject(0) { |sum, count| sum + count }

        @histogram_buckets = stars_per_line.map { |count| count.to_f / total_stars.to_f }
      end

    end
  end
end
@@ -0,0 +1,184 @@
1
+ Hirsute: The Manual
2
+ ===================
3
+
4
+ Hirsute is a Ruby DSL for defining rules that yield fake data sets. You can use these fake data sets for examples in an application, testing code against a "normal" (versus cluttered and nonsensical dev) database, or for generating data sets that can be used for load testing an application.
5
+
6
+ Usage
7
+ -----
8
+ ruby lib/hirsute.rb filename
9
+
10
+ By convention, hirsute files end in .hrs, but you can pass any file you'd like to it.
11
+
12
+ Commands are interpreted in a top-down fashion, which means you must define an object type before you use it.
13
+
14
+ Templates
15
+ ------------
16
+ * a/an('_type_') - defines a template for a type of object in the system. You can pass a block of Ruby code which will get executed. Usually this will include _has_ and _is\_stored\_in_. Once you define a template, you can use _type_ as a regular name (e.g., once you've called _a('user')_, you can use _user_ as a language element).
17
+
18
+ <code><pre>
19
+ a('user')
20
+ an('elephant') {
21
+ puts "Made an elephant"
22
+ }
23
+ </pre></code>
24
+
25
+ * has _fields_ - within a template definition, defines the set of fields for that template and the generators that will create the data in a specific instance. See below for a list of generators. Note: The first field => generator pair must be on the same line as _has_
26
+
27
+ <code><pre>
28
+ a('user') {
29
+ has :user_id => 1,
30
+ :is\_online => false
31
+ }
32
+ </pre></code>
33
+
34
+ * transients - within a template definition, defines elements that can be generated per object but won't be stored
35
+
36
+ * is\_stored\_in _name_ - within a template definition, determines the storage destination (e.g., a database table)
37
+
38
+ <code><pre>
39
+ a('user') {
40
+ is\_stored\_in 'app\_users'
41
+ }
42
+ </pre></code>
43
+
44
+ * make - once a template is defined, you can call make on it to create a fixed instance of the object type. If there is _exactly_ one collection holding objects of that type, the new object will automatically get added to it
45
+
46
+ <code><pre>
47
+ a('user')
48
+
49
+ users = user * 6
50
+
51
+ new_user = user.make
52
+
53
+ #users.length now equals 7
54
+ </pre></code>
55
+
56
+ * in\_this\_order - specifies a non-arbitrary ordering of fields into the output files. Especially useful for CSV output where a downstream process expects a certain format
57
+
58
+ Generators
59
+ ----------
60
+ These are the different data generators you can attach to any given field. Note that you can always specify a literal value as well that will always get used as the value for that field. Any time you use a generator, you can also pass it a block of code that will be called with the generated value. For instance, if you want to truncate a string that could be larger than the field it's going into, or add a separator between generated results.
61
+
62
+ If a generator returns another generator, that will be called, and so on. If a generator returns a Range object, a random value from that range will be the ultimate generated value.
63
+
64
+ * one_of (options,histogram) - choose a random item from a list of options. If a histogram is passed in, that is used to determine the probability of picking one option over another. If a histogram is not passed in, all options will be picked with equal probability. Note: Histogram must be no longer than the list. It can be shorter, but then items at the end of the list won't be selected. See below about histograms
65
+
66
+ * counter (startingValue) - keep an incrementing counter so that each new object created from the template gets the next value. Useful for ids and for making unique emails or screen names
67
+
68
+ * combination (generators... ) - combines a variable amount of generators into one field. Results are concatenated together as strings
69
+
70
+ * subset (generators... ) - combines some subset (determined randomly) of the first items in the list
71
+
72
+ * read\_from\_file (filename,algorithm) - reads from a file to produce a value, wrapping around as needed. The default algorithm, :markov, skips ahead a random number of lines each time. :linear, the other supported algorithm, will read from the file in sequence. Note: the filename will be relative to the location of the .hrs file
73
+
74
+ * read\_from\_sequence (array) - reads each item in turn from an array in a continuous loop.
75
+
76
+ * depending\_on(field,possibilities) - use different generators or values depending on the value of some other field in the created object. possibilities is a hash of values to generators or values. Hirsute::DEFAULT can be used to specify a path if the value of the specified field doesn't match any defined option
77
+
78
+ Histograms
79
+ ----------
80
+ One of the main features in Hirsute is the ability to choose randomly based on a non-uniform distribution. A variety of methods in the system allow you to pass in a histogram of probabilities that will be used instead of a uniform spread.
81
+
82
+ You can specify a histogram in two ways: by passing a list of probabilities to the method or by passing a string which can be parsed as a histogram laid out horizontally. For instance, the following two calls are valid:
83
+
84
+ <code><pre>
85
+ one\_of([1,2,3],[0.5,0.2,0.3])
86
+ </pre></code>
87
+
88
+ <code><pre>
89
+ sample\_histogram = <<-HIST
90
+ \*\*\*\*\*
91
+ \*\*\*
92
+ \*\*\*\*\*\*\*
93
+ \*\*\*
94
+ HIST
95
+
96
+ one\_of([1,2,3,4],sample\_histogram)
97
+ </pre></code>
98
+
99
+ The histogram parsing code only looks for lines of \* characters. You could thus add comments, axes, or any other information without affecting the parsing.
100
+
101
+ If your histogram has more entries than there are items in the list, Hirsute will raise an exception. If your histogram has fewer entries than your list, it will print out a warning that items at the end of the list will not get selected. Histogram values do not need to add up to one; Hirsute will scale values appropriately.
102
+
103
+
104
+ Collections
105
+ -----------
106
+ Collections can only hold one type of object, but multiple collections can hold the same type of object. A collection supports certain Array methods, such as choice, length, and <<, and also mixes in Enumerable
107
+
108
+ * collection_of *objectType* - create an empty collection of the given object type. You might need to do this when creating mappings to other objects.
109
+
110
+ <code><pre>
111
+ users = collection_of user
112
+ users << user1
113
+ </pre></code>
114
+
115
+ * _template_ * _n_ - create a collection of n objects generated from the template definition.
116
+
117
+ <code><pre>
118
+ a('user')
119
+ users = user * 5 # generates 5 users
120
+ </pre></code>
121
+
122
+ * _collection_ << _template_ - create a new object from the template recipe and append it to the collection.
123
+
124
+ <code><pre>
125
+ a('user')
126
+ users = user * 6
127
+ users << user
128
+ </pre></code>
129
+
130
+ * _collection_ << _object_ - appends the given object to the given collection. Note: collections can only contain one type of object
131
+ * foreach _objectType_ - find every collection that contains the type of object, and iterate through each one in turn. Takes a block that gets each item in turn
132
+
133
+ <code><pre>
134
+ a('user') {
135
+ has :id => counter(1)
136
+ }
137
+ users1 = user * 2
138
+ users2 = user * 1
139
+ foreach user do |item|
140
+ # called a total of 3 times, because all collections with users are included
141
+ end
142
+ </pre></code>
143
+
144
+ * finish(_collection_,_storage_) - output the specified collection based on the given storage type. If no storage type is given, it will use whatever was defined by the storage command
145
+
146
+ * any _type_ - return a single random object of the given type (from any collection that contains that object type). Passing a block that returns a boolean will draw the random object only from those that meet that criterion
147
+
148
+ <code><pre>
149
+ a('user') {
150
+ has :id => counter(1)
151
+ }
152
+ user_set_1 = user * 20
153
+ user_set_2 = user * 30
154
+ sample_user_1 = any user # user could be from either collection
155
+ sample_user_2 = any user {|cur_user| cur_user.id < 20} # will only pick a random user from the first collection
156
+ </pre></code>
157
+
158
+ * every _type_ - return an array of every element of the specified type (from any collection that contains objects of that type). Passing a block will result in an array that only contains items where the block returns true.
159
+
160
+ <code><pre>
161
+ a('user') {
162
+ has :id => counter(1)
163
+ }
164
+ users_1 = user * 3
165
+ users_2 = user * 7
166
+ every(user) {|cur_user| cur_user.id > 2 && cur_user.id < 5} # returns a subset of users that span the two collections
167
+ </pre></code>
168
+
169
+ Miscellaneous
170
+ -------------
171
+ * storage _type_ - the default storage system to output to. Currently, :mysql and :csv are supported
172
+
173
+ * storage\_options _hash_ - various options to modify the behavior of the storage output
174
+ * :mysql options:
175
+ * :max\_allowed\_packet - the maximum size of the insert created, which is configured for bulk inserts. Defaults to 1048576
176
+ * :csv options:
177
+ * :separator - the character to use between fields. Defaults to ","
178
+
179
+
180
+ * pick\_from(items,probabilities) - Utility method for returning a random item from an array based on an optional histogram. If the histogram is not passed in, a random item will be chosen based on a uniform distribution. Otherwise, the passed-in histogram will be used to determine the probability of any item being returned.
181
+
182
+
183
+
184
+
@@ -0,0 +1,44 @@
1
+ # the script used in the readme
2
+
3
+ storage :mysql
4
+
5
+ a('user') {
6
+ has :id => counter,
7
+ :email => combination(
8
+ "testuser",counter,"@",one_of(['gmail','aol','yahoo']),".com")
9
+ is_stored_in "users"
10
+ }
11
+
12
+ #make 1000 users
13
+ users = user * 1000
14
+
15
+ # define a friendship object that maps two users together. The user ids are given literal placeholder values here so that the fields exist; the real ids are filled in later
16
+ a('friendship') {
17
+ has :user1 => 1,
18
+ :user2 => 1
19
+ is_stored_in "friendship"
20
+ }
21
+ friendships = collection_of friendship
22
+
23
+ # for each user, pick an appropriate number of friends and create the friendship objects
24
+ foreach user do |cur_user|
25
+ # figure out a number of friends this user might have. Pass in a histogram to steer the probability the way we want
26
+ # the first argument is the options to draw from, the second argument (optional) is a histogram representing distribution of probabilities
27
+ num_friends = pick_from([0,1,2,3,4,5,6,7,8,9,10],
28
+ [0.02,0.1,0.3,0.3,0.2,0.01,0.01,0.01,0.01,0.02,0.01]
29
+ )
30
+
31
+ # since this is Ruby, you can just write Ruby code as needed
32
+ (0...num_friends).each do |idx|
33
+ # grab a random user that isn't this one
34
+ friend = any(user) {|friend| friend.id != cur_user.id}
35
+
36
+ new_friendship = friendship.make # because there's only one collection holding these, it's added automatically
37
+ new_friendship.user1 = friend.id
38
+ new_friendship.user2 = cur_user.id
39
+ end
40
+ end
41
+
42
+ # and now write them all out to files
43
+ finish users
44
+ finish friendships
@@ -0,0 +1,145 @@
1
+ # A sample data definition for a fictional wine cellar management application
2
+ # As a wine accumulator, I should say that this is an inadequate model for such an application, but it suffices to give a sense of how Hirsute can be used
3
+ #
4
+ # The basic model is:
5
+ # bottle - canonical form of a wine bottle description
6
+ # user - a user of the system
7
+ # cellar - a collection of bottles
8
+ # tasting_note - one user's perceptions of a bottle (which may or may not be in the user's cellar)
9
+ storage :mysql
10
+
11
+ #define the users, who have unique screen names by virtue of the counter, but are otherwise random combinations of common wine forum terms
12
+ a('user') {
13
+ has :user_id => counter(1),
14
+ :screen_name => combination(
15
+ one_of(["Wine","oeno","eno","Vino"]),
16
+ one_of(["Lover","Taster","phile"]),
17
+ counter(1))
18
+ is_stored_in "users"
19
+ }
20
+
21
+ # create 1000 new users, each based off the template defined above
22
+ users = user * 1000
23
+
24
+ # a bottle represents a normalized definition of a bottle. It has an id, a producer (pulled randomly from files), a vintage that is most likely from the 1990s or 2000s
25
+ # -- the types of bottles one might put away for a while -- up to four grapes, and a country of origin
26
+ a('bottle') {
27
+ has :bottle_id => counter(1),
28
+ :producer => subset(
29
+ read_from_file('wine_cellar_winery_firsts.txt') {|text| text + " "},
30
+ read_from_file('wine_cellar_winery_seconds.txt') {|text| text + " "},
31
+ read_from_file('wine_cellar_winery_thirds.txt') {|text| text + " "}) {|result| result.strip},
32
+ :vintage => one_of([1960...1970,1970...1980,1980...1990,1990...2000,2000...2010],[0.05,0.05,0.2,0.4,0.3]), # different vintages have different likelihoods, so use that accordingly
33
+ :grapes => depending_on(:country,
34
+ 'Italy' =>
35
+ subset(read_from_file('wine_cellar_italian_grapes.txt') {|text| text + ","},
36
+ read_from_file('wine_cellar_italian_grapes.txt') {|text| text + ","},
37
+ read_from_file('wine_cellar_italian_grapes.txt') {|text| text + ","},
38
+ read_from_file('wine_cellar_italian_grapes.txt') {|text| text + ","}),
39
+ Hirsute::DEFAULT =>
40
+ subset(read_from_file('wine_cellar_grapes.txt') {|text| text + ","},
41
+ read_from_file('wine_cellar_grapes.txt') {|text| text + ","},
42
+ read_from_file('wine_cellar_grapes.txt') {|text| text + ","},
43
+ read_from_file('wine_cellar_grapes.txt') {|text| text + ","}) {|all_grapes| all_grapes[0...all_grapes.length]}),
44
+ :country => one_of(['France','Germany','Italy','Austria','United States'])
45
+
46
+ is_stored_in 'bottles'
47
+ }
48
+
49
+ # This represents a user's cellar. We give each one its own id, but then use a literal for the user_id, because we'll fill it in later
50
+ a('cellar') {
51
+ has :cellar_id => counter(1),
52
+ :user_id => 1 # this will be set later
53
+ is_stored_in "cellars"
54
+ }
55
+
56
+ # create an empty collection, because we'll fill it in as we go
57
+ cellars = collection_of cellar
58
+
59
+ # give each user a cellar
60
+ foreach user do |cur_user|
61
+ user_cellar = cellar.make
62
+ user_cellar.user_id = cur_user.user_id
63
+ end
64
+
65
+
66
+ # This creates an object that would populate a join table (which also includes a field for the amount of bottles on hand)
67
+ # The ids are kept as literals because they'll be filled in later by the code that makes each one
68
+ a('bottleCellar') {
69
+ has :cellar_id => 1, # this is set later
70
+ :bottle_id => 1, # set later
71
+ :amount => one_of([1,5,12,24],[0.3,0.3,0.3,0.1]) {|count| count + rand(4)} # assume that people generally have around one or five instances of a bottle, less often a case
72
+ is_stored_in 'bottle_cellar'
73
+ }
74
+ # now give each cellar some number of bottles
75
+ bottleCellars = collection_of bottleCellar
76
+
77
+
78
+
79
+ # This defines a tasting note that a single user might write about a single bottle. It pulls descriptors from various files.
80
+ star_ratings = <<-HIST
81
+ 1 | **
82
+ |
83
+ 2 | **
84
+ |
85
+ 3 | *********
86
+ |
87
+ 4 | *********
88
+ |
89
+ 5 | *****
90
+ HIST
91
+
92
+ a('tastingNote') {
93
+ has :tasting_note_id => counter(1),
94
+ :description => combination(
95
+ subset(
96
+ read_from_file('wine_cellar_aromas.txt') {|text| text + ","},
97
+ read_from_file('wine_cellar_aromas.txt') {|text| text + ","},
98
+ read_from_file('wine_cellar_aromas.txt') {|text| text + ","},
99
+ read_from_file('wine_cellar_aromas.txt') {|text| text + ","},
100
+ read_from_file('wine_cellar_aromas.txt') {|text| text + ","},
101
+ read_from_file('wine_cellar_aromas.txt') {|text| text + ","}),
102
+ subset(
103
+ read_from_file('wine_cellar_flavors.txt') {|text| text + ","},
104
+ read_from_file('wine_cellar_flavors.txt') {|text| text + ","},
105
+ read_from_file('wine_cellar_flavors.txt') {|text| text + ","})
106
+ ),
107
+ :rating => one_of([1,2,3,4,5],star_ratings),
108
+ :bottle_id => 1, # filled in later
109
+ :user_id => 1 # filled in later
110
+ is_stored_in 'tasting_note'
111
+
112
+ }
113
+
114
+ tastingNotes = collection_of tastingNote
115
+
116
+ # make 100 different bottles of wine
117
+ bottles = bottle * 100
118
+
119
+ # for every cellar, add a bunch of bottles with tasting notes
120
+ foreach cellar do |cur_cellar|
121
+ # a cellar might have 10, 20 (most likely), or 40 different (distinct) bottles, with some flux
122
+ count = pick_from([10,20,40],[0.2,0.6,0.2]) + rand(10)
123
+
124
+ (1..count).each do |amount|
125
+ cur_bottle = any bottle
126
+
127
+ # attach this bottle to the cellar
128
+ cur_bottle_cellar = bottleCellar.make # note that this will fill in an amount of bottles within the cellar
129
+ cur_bottle_cellar.bottle_id = cur_bottle.bottle_id
130
+ cur_bottle_cellar.cellar_id = cur_cellar.cellar_id
131
+
132
+ # make a tasting note that a user might have entered for the bottle
133
+ tasting_note = tastingNote.make
134
+ tasting_note.bottle_id = cur_bottle.bottle_id
135
+ tasting_note.user_id = cur_cellar.user_id
136
+ end
137
+ end
138
+
139
+
140
+
141
+ finish users
142
+ finish bottles
143
+ finish cellars
144
+ finish bottleCellars
145
+ finish tastingNotes