RubyGems - hirsute - Versions diffs - 0.1.0 - Mend

hirsute 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

checksums.yaml +7 -0
data/MIT_LICENSE +7 -0
data/README.md +105 -0
data/bin/hirsute +14 -0
data/lib/hirsute.rb +110 -0
data/lib/hirsute_collection.rb +67 -0
data/lib/hirsute_constraint.rb +17 -0
data/lib/hirsute_fixed.rb +17 -0
data/lib/hirsute_generator.rb +140 -0
data/lib/hirsute_make_generators.rb +125 -0
data/lib/hirsute_output.rb +142 -0
data/lib/hirsute_template.rb +159 -0
data/lib/hirsute_test.rb +25 -0
data/lib/hirsute_utils.rb +108 -0
data/lib/histoparse.rb +47 -0
data/manual.md +184 -0
data/samples/readme.hrs +44 -0
data/samples/wine_cellar.hrs +145 -0
data/tests/first_names.txt +8 -0
data/tests/hirsute_test.rb +362 -0
data/tests/histoparse_test.rb +32 -0
metadata +72 -0

data/lib/hirsute_utils.rb ADDED

@@ -0,0 +1,108 @@
+# various utility methods
+require 'lib/histoparse.rb'
+module Hirsute
+  module Support
+    include Hirsute::HistoParse
+    # store a map of range objects to its constituent array. But we want to store it within the Module's eigenclass so it's shared
+    # across the code base
+    Hirsute::Support.instance_eval {@rangeToArray = Hash.new}
+    # return the class object for the given string. Recipes for object types (e.g., a('thing'))
+    # create a class definition for that object (Thing) for a variety of reasons. This provides an easy mechanism
+    # for returning the class constant given a String, which is often what we have available when working
+    # with the template instead of a fixed product of applying that template
+    def class_for_name(className);Kernel.const_get(className.capitalize);end;
+    # given an array of probabilities, return an integer (between 0 and length of probabilities) based on the probabilities passed in
+    # in other words, [.9,.05,.05] would return 0 approximately 90% of the time.
+    def integer_from_histogram(probabilities)
+      sum = 0
+      probabilities.each {|probability| sum = sum + probability}
+      high_end = sum
+      random_value = rand * high_end
+      final_idx = 0
+      ret_val = probabilities.each_index do |idx|
+        cur_prob = probabilities[idx]
+        if random_value <= high_end && random_value > high_end - cur_prob
+           final_idx = idx
+           break
+        else
+            high_end = high_end - cur_prob
+            next
+        end
+      end
+      final_idx
+   end
+   def random_item_with_histogram(list,probabilities)
+     raise "#{list.inspect} needs to have as many items as #{probabilities.inspect}" if list.length < probabilities.length
+     puts "Warning: #{list.inspect} has more items than #{probabilities.inspect} has items; some items will never be selected." if probabilities.length < list.length
+     # if probabilities is a string, parse it into an array using HistoParse and then recurse
+     return random_item_with_histogram(list,parse_histogram(probabilities).histogram_buckets) if !probabilities.nil? && probabilities.kind_of?(String)
+     probabilities.nil? || !probabilities.length ? list.choice : list[integer_from_histogram(probabilities)]
+   end
+   def is_template(obj)
+     obj.kind_of? Hirsute::Template
+   end
+   # refactored code for making/registering a template and class type
+   def make_template(objName,&block)
+     # define a class with the given name. This is so that we can store class instance variables,
+     # present more readable information to users, and so forth. Basically a('thing') should create
+     # a class named Thing that can be used elsewhere
+     # do this here because template.instance_eval will add to this class if there's an is_stored_in method
+     # used
+     objClass = Class.new(Hirsute::Fixed)
+     Kernel.const_set(objName.capitalize.to_sym,objClass)
+     # construct a new object, set self to that object
+     # then yield to the block, which will call methods defined in Template
+     template = Hirsute::Template.new(objName)
+     if block_given?
+       template.instance_eval &block
+     end
+     return template
+   end
+   # refactored logic for deriving generator from a value
+   def generator_from_value(value,&block)
+      if value.is_a? Generator
+         value
+      else
+         LiteralGenerator.new(value,block)
+      end
+   end
+   # Given a range object, select an item randomly from it. This method hashes range -> range.to_a for speed
+   def random_from_range(range)
+     ary = get_range_array(range)
+   end
+   # Gets the array associated with a range from the cache, or adds an entry if it's not there
+   # refactored for unit testing
+   # Intended to be within the module's eigenclass
+   def Support.get_range_array(range)
+     ary = @rangeToArray[range]
+     if !ary
+        ary = range.to_a
+        @rangeToArray[range] = ary
+     end
+     ary
+   end
+  end
+end

data/lib/histoparse.rb ADDED

@@ -0,0 +1,47 @@
+# A mini-DSL within Hirsute that can parse ASCII-art histograms for use in Hirsute
+# For instance, it could translate this
+# |****
+# |**
+# |*****
+# into a histogram array of
+# [0.36,0.18,0.45]
+module Hirsute
+  module HistoParse
+    def parse_histogram(histogram_string)
+      ParsedHistogram.new(histogram_string)
+    end
+    #encapsulates the information about a parsed histogram
+    class ParsedHistogram
+      attr_reader :histogram_buckets
+      def initialize(histogram_string)
+        line_regex = /(\*+)/
+        lines = histogram_string.split "\n"
+        # extract information
+        # just those lines that have histogram data
+        histo_lines = Array.new
+        # parallel array that tracks stars per line
+        stars_per_line = Array.new
+        total_stars = 0
+        lines.each do |line|
+          next if !(line_regex =~ line)
+          stars = line[line_regex,1]
+          total_stars = total_stars + stars.length
+          histo_lines << line
+          stars_per_line << stars.length
+        end
+        @histogram_buckets = stars_per_line.map {|count| count.to_f / total_stars.to_f}
+      end
+    end
+  end
+end

data/manual.md ADDED

@@ -0,0 +1,184 @@
+Hirsute: The Manual
+===================
+Hirsute is a Ruby DSL for defining rules that yield fake data sets. You can use these fake data sets for examples in an application, testing code against a "normal" (versus cluttered and nonsensical dev) database, or for generating data sets that can be used for load testing an application.
+Usage
+-----
+ruby lib/hirsute.rb filename
+By convention, hirsute files end in .hrs, but you can pass any file you'd like to it.
+Commands are interpreted in a top-down fashion, which means you must define an object type before you use it.
+Templates
+------------
+* a/an('_type_') - defines a template for a type of object in the system. You can pass a block of Ruby code which will get executed. Usually this will include _has_ and _is\_stored\_in_. Once you define a template, you can use _type_ as a regular name (e.g., once you've called _a('user')_, you can use _user_ as a language element).
+<code><pre>
+    a('user')
+    an('elephant') {
+        puts "Made an elephant"
+    }
+</pre></code>
+* has _fields_ - within a template definition, defines the set of fields for that template and the generators that will create the data in a specific instance. See below for a list of generators. Note: The first field => generator pair must be on the same line as _has_
+<code><pre>
+    a('user') {
+        has :user_id => 1,
+            :is\_online => false
+    }
+</pre></code>
+* transients - within a template definition, defines elements that can be generated per object but won't be stored
+* is\_stored\_in _name_ - within a template definition, determines the storage destination (e.g., a database table)
+<code><pre>
+    a('user') {
+        is\_stored\_in 'app\_users'
+    }
+</pre></code>
+* make - once a template is defined, you can call make on it to create a fixed instance of the object type. If there is _exactly_ one collection holding objects of that type, the new object will automatically get added to it
+<code><pre>
+    a('user')
+    users = user * 6
+    new_user = user.make
+    #users.length now equals 7
+</pre></code>
+* in\_this\_order - specifies a non-arbitrary ordering of fields into the output files. Especially useful for CSV output where a downstream process expects a certain format
+Generators
+----------
+These are the different data generators you can attach to any given field. Note that you can always specify a literal value as well that will always get used as the value for that field. Any time you use a generator, you can also pass it a block of code that will be called with the generated value. For instance, if you want to truncate a string that could be larger than the field it's going into, or add a separator between generated results.
+If a generator returns another generator, that will be called, and so on. If a generator returns a Range object, a random value from that range will be the ultimate generated value.
+* one_of (options,histogram) - choose a random item from a list of options. If a histogram is passed in, that is used to determine the probability of picking one option over another. If a histogram is not passed in, all options will be picked with equal probability. Note: Histogram must be no longer than the list. It can be shorter, but then the items at the end of the list won't be selected. See below about histograms
+* counter (startingValue) - keep an incrementing counter so that each new object created from the template gets the next value. Useful for ids and for making unique emails or screen names
+* combination (generators... ) - combines a variable amount of generators into one field. Results are concatenated together as strings
+* subset (generators... ) - combines some subset (determined randomly) of the first items in the list
+* read\_from\_file (filename,algorithm) - reads from a file to produce a value, wrapping around as needed. The default algorithm, :markov, skips ahead a random number of lines each time. :linear, the other supported algorithm, will read from the file in sequence. Note: the filename will be relative to the location of the .hrs file
+* read\_from\_sequence (array) - reads each item in turn from an array in a continuous loop.
+* depending\_on(field,possibilities) - use different generators or values depending on the value of some other field in the created object. possibilities is a hash of values to generators or values. Hirsute::DEFAULT can be used to specify a path if the value of the specified field doesn't match any defined option
+Histograms
+----------
+One of the main features in Hirsute is the ability to choose randomly based on a non-uniform distribution. A variety of methods in the system allow you to pass in a histogram of probabilities that will be used instead of a uniform spread.
+You can specify a histogram in two ways: by passing a list of probabilities to the method or by passing a string which can be parsed as a histogram laid out horizontally. For instance, the following two calls are valid:
+<code><pre>
+    one\_of([1,2,3],[0.5,0.2,0.3])
+</pre></code>
+<code><pre>
+    sample\_histogram = <<-HIST
+       \*\*\*\*\*
+       \*\*\*
+       \*\*\*\*\*\*\*
+       \*\*\*
+    HIST
+    one\_of([1,2,3,4],sample\_histogram)
+</pre></code>
+The histogram parsing code only looks for lines of \* characters. You could thus add comments, axes, or any other information without affecting the parsing.
+If your histogram has more entries than there are items in the list, Hirsute will raise an exception. If your histogram has fewer entries than your list, it will print out a warning that items at the end of the list will not get selected. Histogram values do not need to add up to one; Hirsute will scale values appropriately.
+Collections
+-----------
+Collections can only hold one type of object, but multiple collections can hold the same type of object. A collection supports certain Array methods, such as choice, length, and <<, and also mixes in Enumerable
+* collection_of *objectType* - create an empty collection of the given object type. You might need to do this when creating mappings to other objects.
+<code><pre>
+    users = collection_of user
+    users << user1
+</pre></code>
+* _template_ * _n_ - create a collection of n objects generated from the template definition.
+<code><pre>
+    a('user')
+    users = user * 5 # generates 5 users
+</pre></code>
+* _collection_ << _template_ - create a new object from the template recipe and append it to the collection.
+<code><pre>
+   a('user')
+   users = user * 6
+   users << user
+ </pre></code>
+* _collection_ << _object_ - appends the given object to the given collection. Note: collections can only contain one type of object
+* foreach _objectType_ - find every collection that contains the type of object, and iterate through each one in turn. Takes a block that gets each item in turn
+<code><pre>
+    a('user') {
+        has :id => counter(1)
+    }
+    users1 = user * 2
+    users2 = user * 1
+    foreach user do |item|
+       # called a total of 3 times, because all collections with users are included
+    end
+</pre></code>
+* finish(_collection_,_storage_) - output the specified collection based on the given storage type. If no storage type is given, it will use whatever was defined by the storage command
+* any _type_ - return a single random object of the given type (from any collection that contains that object type). Passing a block that returns a boolean will draw the random object only from the objects that meet that criterion
+<code><pre>
+    a('user') {
+        has :id => counter(1)
+    }
+    user_set_1 = user * 20
+    user_set_2 = user * 30
+    sample_user_1 = any user # user could be from either collection
+    sample_user_2 = any user {|cur_user| cur_user.id < 20} # will only pick a random user from the first collection
+</pre></code>
+* every _type_ - return an array of every element of the specified type (from any collection that contains objects of that type). Passing a block will result in an array that only contains items where the block returns true.
+<code><pre>
+    a('user') {
+        has :id => counter(1)
+    }
+    users_1 = user * 3
+    users_2 = user * 7
+    every(user) {|cur_user| cur_user.id > 2 && cur_user.id < 5} # returns a subset of users that span the two collections
+</pre></code>
+Miscellaneous
+-------------
+* storage _type_ - the default storage system to output to. Currently, :mysql and :csv are supported
+* storage\_options _hash_ - various options to modify the behavior of the storage output
+    * :mysql options:
+        * :max\_allowed\_packet - the maximum size of the insert created, which is configured for bulk inserts. Defaults to 1048576
+    * :csv options:
+        * :separator - the character to use between fields. Defaults to ","
+* pick\_from(items,probabilities) - Utility method for returning a random item from an array based on an optional histogram. If the histogram is not passed in, a random item will be chosen based on a uniform distribution. Otherwise, the passed-in histogram will be used to determine the probability of any item being returned.

data/samples/readme.hrs ADDED

@@ -0,0 +1,44 @@
+# the script used in the readme: makes 1000 users, then friendship rows that link each user to randomly chosen other users
+storage :mysql
+a('user') {
+    has :id => counter,
+        :email => combination(
+                  "testuser",counter,"@",one_of(['gmail','aol','yahoo']),".com")
+    is_stored_in "users"
+}
+# make 1000 users
+users = user * 1000
+# define a friendship object that maps two users together. The user ids are literal placeholders here; the real ids are filled in by the loop below
+a('friendship') {
+    has :user1 => 1,
+        :user2 => 1
+    is_stored_in "friendship"
+}
+friendships = collection_of friendship
+# for each user, pick an appropriate number of friends and create the friendship objects
+foreach user do |cur_user|
+  # figure out a number of friends this user might have. Pass in a histogram to steer the probability the way we want
+  # the first argument is the options to draw from, the second argument (optional) is a histogram representing distribution of probabilities
+  num_friends = pick_from([0,1,2,3,4,5,6,7,8,9,10],
+                           [0.02,0.1,0.3,0.3,0.2,0.01,0.01,0.01,0.01,0.02,0.01]
+                          )
+  # since this is Ruby, you can just write plain Ruby code as needed
+  (0...num_friends).each do |idx|
+     # grab a random user that isn't this one
+     friend = any(user) {|friend| friend.id != cur_user.id}
+     new_friendship = friendship.make # because there's only one collection holding these, it's added automatically
+     new_friendship.user1 = friend.id
+     new_friendship.user2 = cur_user.id
+  end
+end
+# and now write them all out to files
+finish users
+finish friendships

data/samples/wine_cellar.hrs ADDED

@@ -0,0 +1,145 @@
+# A sample data definition for a fictional wine cellar management application
+# As a wine accumulator, I should say that this is an inadequate model for such an application, but it suffices as an example
+#
+# The basic model is:
+#    bottle - canonical form of a wine bottle description
+#    user - a user of the system
+#    cellar - a collection of bottles
+#    tasting_note - one user's perceptions of a bottle (which may or may not be in the user's cellar)
+storage :mysql
+# define the users, who have unique screen names by virtue of the counter, but are otherwise random combinations of common wine forum terms
+a('user') {
+  has :user_id => counter(1),
+      :screen_name => combination(
+                         one_of(["Wine","oeno","eno","Vino"]),
+                         one_of(["Lover","Taster","phile"]),
+                         counter(1))
+  is_stored_in "users"
+}
+# create 1000 new users, each based off the template defined above
+users = user * 1000
+# a bottle represents a normalized definition of a bottle. It has an id, a producer (pulled randomly from files), a vintage that is most likely from the 1990s or 2000s
+# (see the histogram on :vintage), up to four grapes, and a country of origin
+a('bottle') {
+  has :bottle_id => counter(1),
+      :producer => subset(
+                     read_from_file('wine_cellar_winery_firsts.txt') {|text| text + " "},
+                     read_from_file('wine_cellar_winery_seconds.txt') {|text| text + " "},
+                     read_from_file('wine_cellar_winery_thirds.txt') {|text| text + " "}) {|result| result.strip},
+      :vintage => one_of([1960...1970,1970...1980,1980...1990,1990...2000,2000...2010],[0.05,0.05,0.2,0.4,0.3]), # different vintages have different likelihoods, so use that accordingly
+      :grapes => depending_on(:country,
+                       'Italy' =>
+                         subset(read_from_file('wine_cellar_italian_grapes.txt') {|text| text + ","},
+                            read_from_file('wine_cellar_italian_grapes.txt') {|text| text + ","},
+                            read_from_file('wine_cellar_italian_grapes.txt') {|text| text + ","},
+                            read_from_file('wine_cellar_italian_grapes.txt') {|text| text + ","}),
+                        Hirsute::DEFAULT =>
+                         subset(read_from_file('wine_cellar_grapes.txt') {|text| text + ","},
+                          read_from_file('wine_cellar_grapes.txt') {|text| text + ","},
+                          read_from_file('wine_cellar_grapes.txt') {|text| text + ","},
+                          read_from_file('wine_cellar_grapes.txt') {|text| text + ","}) {|all_grapes| all_grapes[0...all_grapes.length]}), # NOTE(review): this slice keeps the whole value; presumably meant to drop the trailing "," - confirm
+      :country => one_of(['France','Germany','Italy','Austria','United States'])
+  is_stored_in 'bottles'
+}
+# This represents a user's cellar. We give each one its own id, but then use a literal for the user_id, because we'll fill it in later
+a('cellar') {
+  has :cellar_id => counter(1),
+      :user_id => 1 # this will be set later
+  is_stored_in "cellars"
+}
+# create an empty collection, because we'll fill it in as we go
+cellars = collection_of cellar
+# give each user a cellar
+foreach user do |cur_user|
+  user_cellar = cellar.make
+  user_cellar.user_id = cur_user.user_id
+end
+# This creates an object that would populate a join table (which also includes a field for the amount of bottles on hand)
+# The ids are kept as literals because they'll be filled in later by the code that makes each one
+a('bottleCellar') {
+  has :cellar_id => 1, # this is set later
+      :bottle_id => 1,  # set later
+      :amount => one_of([1,5,12,24],[0.3,0.3,0.3,0.1]) {|count| count + rand(4)} # base amounts of 1, 5 or 12 are equally likely (0.3 each), 24 is rarer (0.1); the block then adds 0 to 3 more
+  is_stored_in 'bottle_cellar'
+}
+# create an empty collection for the join rows; it is filled in by the cellar loop at the end of the script
+bottleCellars = collection_of bottleCellar
+# ASCII-art histogram for the 1-5 star :rating of a tasting note (defined next); only the runs of * are parsed, so the labels and empty rows are ignored
+star_ratings = <<-HIST
+  1 | **
+    |
+  2 | **
+    |
+  3 | *********
+    |
+  4 | *********
+    |
+  5 | *****
+HIST
+a('tastingNote') {
+  has :tasting_note_id => counter(1),
+      :description => combination(
+         subset(
+           read_from_file('wine_cellar_aromas.txt') {|text| text + ","},
+           read_from_file('wine_cellar_aromas.txt') {|text| text + ","},
+           read_from_file('wine_cellar_aromas.txt') {|text| text + ","},
+           read_from_file('wine_cellar_aromas.txt') {|text| text + ","},
+           read_from_file('wine_cellar_aromas.txt') {|text| text + ","},
+           read_from_file('wine_cellar_aromas.txt') {|text| text + ","}),
+         subset(
+           read_from_file('wine_cellar_flavors.txt') {|text| text + ","},
+           read_from_file('wine_cellar_flavors.txt') {|text| text + ","},
+           read_from_file('wine_cellar_flavors.txt') {|text| text + ","})
+         ),
+      :rating => one_of([1,2,3,4,5],star_ratings),
+      :bottle_id => 1, # filled in later
+      :user_id => 1    # filled in later
+    is_stored_in 'tasting_note'
+}
+tastingNotes = collection_of tastingNote
+# make 100 different bottles of wine
+bottles = bottle * 100
+# for every cellar, add a bunch of bottles with tasting notes
+foreach cellar do |cur_cellar|
+  # a cellar gets a base of 10, 20 (most likely), or 40 bottle entries, plus 0 to 9 more for some flux
+  count = pick_from([10,20,40],[0.2,0.6,0.2]) + rand(10)
+  (1..count).each do |amount|
+    cur_bottle = any bottle
+    # attach this bottle to the cellar
+    cur_bottle_cellar = bottleCellar.make # note that this will fill in an amount of bottles within the cellar
+    cur_bottle_cellar.bottle_id = cur_bottle.bottle_id
+    cur_bottle_cellar.cellar_id = cur_cellar.cellar_id
+    # make a tasting note that a user might have entered for the bottle
+    tasting_note = tastingNote.make
+    tasting_note.bottle_id = cur_bottle.bottle_id
+    tasting_note.user_id = cur_cellar.user_id
+  end
+end
+finish users
+finish bottles
+finish cellars
+finish bottleCellars
+finish tastingNotes