mddb 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5) hide show
  1. data/bin/aggregater +71 -0
  2. data/bin/average +59 -0
  3. data/bin/binner +95 -0
  4. data/lib/mddb/version.rb +1 -1
  5. metadata +16 -10
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/env ruby
2
+ require "#{__FILE__}/../stat"
3
+ require 'micro-optparse'
4
+
5
# Command-line options for the aggregater, parsed with micro-optparse.
# The parsed result is read elsewhere in this script as @options[:name].
@options = Parser.new do |p|
  # The banner used to say "Binning" (copied from the binner script).
  p.banner = "Aggregate, see below for options"
  p.version = "0.01"
  p.option :live, "print the live status", :default => false
  p.option :header, "use if the stream contains a header", :default => false
  p.option :delimiter, "specify the delimiting character", :default => "tsv", :value_in_set => ['tsv', 'csv', 'ssv']
  p.option :drop, "Drop lines that are poorly formatted, instead of failing", :default => false
  # At least one leading column must act as the grouping key.
  p.option :keys, "Number of columns to use as keys", :default => 1, :value_satisfies => lambda { |x| x > 0 }
end.process!
14
+
15
# Tells whether the line ARGF has just read is the header row: the
# :header option must be set and ARGF's line counter must be at 1.
def header?
  header_expected = @options[:header]
  header_expected && ARGF.lineno == 1
end
18
+
19
# Streaming aggregation: consecutive rows that share the same leading key
# columns are summed column by column. One output row (key + totals) is
# printed each time the key changes, and once more at end of input.
# Only adjacent rows are merged, so equal keys must be adjacent in the input.
catch :shit_stream do
  input = 0
  output = 0
  stats = Stats.new "Aggregate", :input, :output, :ratio, :elapsed
  # Honour the --delimiter option; the default ('tsv') is a tab, which is
  # what this script used unconditionally before.
  delimiter = { 'tsv' => "\t", 'csv' => ",", 'ssv' => " " }.fetch(@options[:delimiter], "\t")
  key_count = @options[:keys]
  current_key = nil
  totals = []

  # Formats one aggregated row: the key columns followed by the totals.
  format_row = lambda do |key, sums|
    key.map { |k| k.to_s }.join(delimiter) + delimiter + sums.map { |d| d.to_s }.join(delimiter)
  end

  ARGF.each do |line|
    stats.clock if ARGF.lineno == 1
    input += 1
    line.chomp!
    if header?
      puts line
      output += 1
    else
      cols = line.split(delimiter)

      if cols.length < key_count
        # Too few columns to even build the key. Either skip the line
        # (--drop) or stop with a readable message instead of a
        # NoMethodError on nil.
        unless @options[:drop]
          $stderr.puts "There were #{cols.length} columns on line #{ARGF.lineno}, but #{key_count} key columns were requested"
          throw :shit_stream
        end
      else
        key = cols[0...key_count]
        values = cols[key_count..-1].map { |d| d.to_f }

        if current_key.nil?
          # Very first data row: start the first run.
          current_key = key
          totals = Array.new(values.length, 0)
        elsif key != current_key
          # Key changed: flush the finished run and start a new one.
          puts format_row.call(current_key, totals)
          output += 1
          current_key = key
          totals = Array.new(values.length, 0)
        end

        values.each_with_index { |value, i| totals[i] += value }
      end

      stats.show(input, output) if @options[:live]
    end
  end

  # Flush the last run. Skipped when no data row was seen (empty or
  # header-only input), which previously crashed on nil.map.
  unless current_key.nil?
    output += 1
    puts format_row.call(current_key, totals)
  end
  stats.finish(input, output)
end
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env ruby
2
+ require "#{__FILE__}/../stat"
3
+ require 'micro-optparse'
4
+
5
# Validator for the :columns option: every entry must convert (via to_i)
# to an integer greater than zero, since columns are numbered from 1.
# An empty list is accepted.
valid_columns = lambda { |x|
  x.all? { |d| d.to_i > 0 }
}
12
+
13
# Command-line options for the average script, parsed with micro-optparse.
# The parsed result is read elsewhere in this script as @options[:name].
@options = Parser.new do |p|
  p.banner = "Average, see below for options"
  p.version = "0.01"
  p.option :live, "print the live status", default: false
  p.option :header, "use if the stream contains a header", default: false
  p.option :delimiter, "specify the delimiting character",
           default: "tsv", value_in_set: ['tsv', 'csv', 'ssv']
  p.option :drop, "Drop lines that are poorly formatted, instead of failing", default: false
  p.option :columns, "The columns whose average you wish to obtain, columns are integers starting at 1",
           default: [], value_satisfies: valid_columns
end.process!
22
+
23
# Tells whether the line ARGF has just read is the header row: the
# :header option must be set and ARGF's line counter must be at 1.
def header?
  header_expected = @options[:header]
  header_expected && ARGF.lineno == 1
end
26
+
27
# Streaming "average": for every data row, each selected column is divided
# by the value in the row's last column, and the whole row is re-printed
# as floats.
# NOTE(review): presumably the last column holds a count (e.g. produced by
# binner --emit followed by aggregater) -- confirm.

# Selected columns as 0-based indices (the option is 1-based).
columns = @options[:columns].map { |d| d.to_i - 1 }
# Number of columns a row needs. Zero when no columns were selected, which
# previously crashed on nil + 1.
required = columns.empty? ? 0 : columns.max + 1
# Honour the --delimiter option; the default ('tsv') is a tab, which is
# what this script used unconditionally before.
delimiter = { 'tsv' => "\t", 'csv' => ",", 'ssv' => " " }.fetch(@options[:delimiter], "\t")
# Counters are kept as Floats, as in the original.
# NOTE(review): whether Stats relies on that is not visible here.
input = 0.0
output = 0.0
stat = Stats.new("Average", :input, :output, :elapsed)
catch :shit_happened do
  ARGF.each do |line|
    stat.clock if ARGF.lineno == 1
    input += 1
    if header?
      puts line
      output += 1
    else
      cols = line.chomp.split(delimiter).map { |d| d.to_f }

      if required > cols.length
        # Malformed row: skip it when --drop is set, otherwise abort.
        next if @options[:drop]
        # Diagnostics go to stderr so they do not mix with the data stream.
        $stderr.puts "There are only #{cols.length} columns, but you said there was a column #{required}"
        throw :shit_happened
      end

      # Capture the divisor once: if the last column is itself selected,
      # re-reading cols.last inside the loop would use the already
      # divided value for the remaining columns.
      divisor = cols.last
      columns.each do |index|
        cols[index] = cols[index] / divisor
      end

      puts cols.map { |c| c.to_s }.join(delimiter)
      output += 1
      stat.show(input, output) if @options[:live]
    end
  end
end
stat.finish(input, output)
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env ruby
2
+ require 'micro-optparse'
3
+ require "#{__FILE__}/../stat.rb"
4
+
5
class String
  # Returns true when the whole string parses as a Float, false otherwise.
  #
  # Only the errors Kernel#Float raises for bad input are rescued, instead
  # of the blanket inline `rescue` modifier, so unrelated failures are no
  # longer silently turned into `false`.
  def is_numeric?
    Float(self)
    true
  rescue ArgumentError, TypeError
    false
  end
end
10
+
11
# Validator for the :bins option: every entry must be a numeric string
# (String#is_numeric?) whose value is not negative. An empty list passes.
# Written as a single pass, with a block parameter that no longer shadows
# the lambda's own argument.
all_are_numeric = lambda { |x|
  x.all? { |width| width.is_numeric? && width.to_f >= 0 }
}
17
+
18
# Command-line options for the binner, parsed with micro-optparse.
# The parsed result is read elsewhere in this script as @options[:name].
@options = Parser.new do |p|
  p.banner = "Binning, see below for options"
  p.version = "0.01"
  p.option :live, "print the live status", default: false
  p.option :header, "use if the stream contains a header", default: false
  p.option :delimiter, "specify the delimiting character",
           default: "tsv", value_in_set: ['tsv', 'csv', 'ssv']
  p.option :emit, "Emit 1 for map-reduce", default: false
  p.option :drop, "Drop lines that are poorly formatted, instead of failing", default: false
  p.option :bins, "bin width for each column, a width of 0 leaves a column unchanged",
           default: [], value_satisfies: all_are_numeric
end.process!

# Bin widths arrive as strings; convert them to Floats in place.
@options[:bins].map! { |width| width.to_f }

# Map each file-type name to the character that separates its columns.
@delimiters = { 'tsv' => "\t", 'csv' => ",", 'ssv' => " " }

# The separator selected by the :delimiter option.
@delimiter = @delimiters[@options[:delimiter]]
35
+
36
# Tells whether the line ARGF has just read is the header row: the
# :header option must be set and ARGF's line counter must be at 1.
def header?
  header_expected = @options[:header]
  header_expected && ARGF.lineno == 1
end
39
+
40
# Snaps +value+ down to the lower edge of its bin.
#
# A +width+ of 0.0 means "leave unchanged" and returns +value+ as is.
# Otherwise the result is the largest multiple of +width+ not above
# +value+, rounded to 6 decimal places.
def bin(value, width)
  return value if width == 0.0

  bin_index = (value / width).floor
  lower_edge = bin_index * width
  lower_edge.round(6)
end
47
+
48
# Suffix appended to each data row: the delimiter followed by "1" when
# the :emit option is set, otherwise an empty string.
def emit
  if @options[:emit]
    @delimiter + "1"
  else
    ""
  end
end
51
+
52
# Suffix appended to the header row: the delimiter followed by "count"
# when the :emit option is set, otherwise an empty string.
def emit_header
  if @options[:emit]
    @delimiter + "count"
  else
    ""
  end
end
55
+
56
# Splits +line+ into its fields using the configured @delimiter.
def columns(line)
  separator = @delimiter
  line.split(separator)
end
59
+
60
# Decides whether a split row must be skipped.
#
# +line+ is the row already split into fields. Returns false when it has
# exactly one field per configured bin width, and true when the count
# differs and the :drop option is set. When the count differs and :drop is
# not set, it reports the problem on stderr and throws :dropped.
def drop?(line)
  expected = @options[:bins].length
  return false if line.length == expected
  return true if @options[:drop]

  $stderr.puts "There were #{line.length} columns on line #{ARGF.lineno}, but you gave #{expected} bins"
  $stderr.puts "Perhaps this wasn't a #{@options[:delimiter]}. Try the -d --help for a list of filetypes"
  $stderr.puts "If you think that some lines might be an issue, use the -r flag to drop bad lines"
  throw :dropped
end
74
+
75
# Main loop of the binner: reads each input line, skips rows rejected by
# drop?, echoes the header prefixed with "#", and prints every other row
# with each field snapped to its bin. drop? may throw :dropped, which
# leaves this block early (before stats.finish).
catch :dropped do
  input = 0
  output = 0
  stats = Stats.new "Binning", :input, :output, :dropped, :elapsed
  ARGF.each do |line|
    input += 1
    text = line.chomp
    row = text.split(@delimiter)
    unless drop? row
      if header?
        puts "#" + text + emit_header
      else
        widths = @options[:bins]
        binned = row.each_with_index.map { |cell, i| bin(cell.to_f, widths[i]) }
        puts binned.join(@delimiter) + emit
      end
      output += 1
    end
    stats.show(input, output) if @options[:live]
  end
  stats.finish(input, output)
end
@@ -1,3 +1,3 @@
1
1
  module Mddb
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mddb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-15 00:00:00.000000000Z
12
+ date: 2012-02-16 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &70212474145840 !ruby/object:Gem::Requirement
16
+ requirement: &70129250480800 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70212474145840
24
+ version_requirements: *70129250480800
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: mongo_mapper
27
- requirement: &70212474145420 !ruby/object:Gem::Requirement
27
+ requirement: &70129250480340 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70212474145420
35
+ version_requirements: *70129250480340
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: hirb
38
- requirement: &70212474145000 !ruby/object:Gem::Requirement
38
+ requirement: &70129250479880 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70212474145000
46
+ version_requirements: *70129250479880
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: thor
49
- requirement: &70212474144540 !ruby/object:Gem::Requirement
49
+ requirement: &70129250479340 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,11 +54,14 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70212474144540
57
+ version_requirements: *70129250479340
58
58
  description: MDDB makes analysing molecular dynamics simulations easy
59
59
  email:
60
60
  - thom.mulvaney@gmail.com
61
61
  executables:
62
+ - aggregater
63
+ - average
64
+ - binner
62
65
  - mddb
63
66
  extensions: []
64
67
  extra_rdoc_files: []
@@ -66,6 +69,9 @@ files:
66
69
  - .gitignore
67
70
  - Gemfile
68
71
  - Rakefile
72
+ - bin/aggregater
73
+ - bin/average
74
+ - bin/binner
69
75
  - bin/mddb
70
76
  - config.yml
71
77
  - lib/mddb.rb