hashed-diff 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/bin/hashed-diff +126 -0
  3. metadata +63 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6e5f755e4ca0d07b9b744614c92b5f773f493171
4
+ data.tar.gz: 42fcbec2684414a4c118d0b847f091a2f0675838
5
+ SHA512:
6
+ metadata.gz: c6545b833e34266139462e6965e1b9e55894bd4db30acb80c555e1731dcf3c9a83dc35d24de6ee1a53420eff8f4cf0e54fd9756dfc9c7d7f9edede5351f1bed0
7
+ data.tar.gz: 22377f8eb95bdef332579ffd9ea6a2e585e50391037a48454bccdde12d1811f1c5f87dde3a3420784da8e8ca9787667fd26351535e728b1187441484eaf7f80d
data/bin/hashed-diff ADDED
@@ -0,0 +1,126 @@
1
+ #!/usr/bin/env ruby
2
+ require 'xxhash'
3
+ #
4
+ # hashed_diff - Copyright Waag Society 2015 -
5
+ # Taco van Dijk & Lodewijk Loos
6
+ #
7
+ # reasonably fast, memory efficient diff for very large files
8
+ #
9
+ # uses the very fast xxhash to create temporary files with hashes for each individual line
10
+ # applies diff to these hashed files
11
+ # transforms the diff output to include the original lines
12
+ #
13
+ # Improvements:
14
+ # speed doesn't seem to improve when using two threads,
15
+ # because hashing already hits I/O limits, so a second thread only degrades performance
16
+ #
# Both file arguments are required. Report misuse on stderr with a failing
# exit status instead of exiting silently as if the diff had succeeded.
abort('usage: hashed-diff OLD_FILE NEW_FILE') unless ARGV[0] && ARGV[1]

# paths of the two files to compare, as given on the command line
$old_file = ARGV[0]
$new_file = ARGV[1]

# diff regexps: classify the lines that `diff` prints for the hashed files
# command line such as "3c3", "5,7d4" or "0a1,2";
# captures: [1] old range, [2] command letter, [3] new range
$command = /(\d*,\d*|\d*)([cda])(\d*,\d*|\d*)/
# hash line coming from the old file
$old_line = /< \d*/
# hash line coming from the new file
$new_line = /> \d*/
# separator between the old and new halves of a change
$divider = /---/
27
+
# First pass over the diff output: when +line+ is a diff command, hand it to
# collect_command so the line numbers it mentions get recorded.
# Every other kind of line is ignored.
def process(line)
  found = $command.match(line)
  collect_command(line, found) if found
end
35
+
# Second pass over the diff output: print +line+ with hashes replaced by the
# original text.
#
# - command lines reset the pending line numbers (process_command) and are
#   echoed unchanged
# - "<" / ">" lines take the next pending line number and print the text
#   stored for it in $old_index / $new_index
# - divider lines are echoed unchanged
# - anything else prints nothing
def output(line)
  command = $command.match(line)
  if command
    process_command(line, command)
    puts line
    return
  end

  case line
  when $old_line
    number = $old_ln.shift
    puts "< #{$old_index[number]}"
  when $new_line
    number = $new_ln.shift
    puts "> #{$new_index[number]}"
  when $divider
    puts line
  end
end
55
+
# Reads the requested lines from a file in a single pass.
#
# path - path of the file to read
# lns  - array of 1-based line numbers to fetch (left unmodified)
#
# Returns an array with exactly one entry per element of +lns+, in the same
# order, so the result can be zipped with the requested numbers. A number that
# does not name a line of the file (for example 0, or one past the end)
# yields nil in its slot instead of shifting the later entries.
def retrieve_lines(path, lns)
  # only positive integers can ever match a line of the file
  wanted = {}
  lns.each { |number| wanted[number] = true if number.is_a?(Integer) && number > 0 }

  found = {}
  unless wanted.empty?
    # File.foreach never treats a leading "|" in +path+ as a command to run
    File.foreach(path).with_index(1) do |text, number|
      next unless wanted.key?(number)

      found[number] = text
      # stop reading as soon as every line that can exist has been seen
      break if found.size == wanted.size
    end
  end

  lns.map { |number| found[number] }
end
70
+
# Output pass: replace the pending line numbers with the ones named by the
# diff command that was just read.
#
# line  - the raw command line (not used here)
# match - MatchData from $command; group 1 is the old range, group 3 the new
def process_command(line, match)
  old_range = match[1]
  new_range = match[3]
  $old_ln = range_to_linenumbers(old_range).flatten
  $new_ln = range_to_linenumbers(new_range).flatten
end
77
+
# Collection pass: remember which original lines a diff command will print.
#
# line  - the raw command line (not used here)
# match - MatchData from $command: [1] old range, [2] command letter,
#         [3] new range
#
# In diff's normal format only change/delete commands are followed by "<"
# lines and only add/change commands by ">" lines, so just those sides are
# collected. The other side only names a position (possibly 0, which is not
# a real line) and is never looked up during output.
def collect_command(line, match)
  action = match[2]
  $old_ln << range_to_linenumbers(match[1]) unless action == 'a'
  $new_ln << range_to_linenumbers(match[3]) unless action == 'd'
end
84
+
# Converts a diff range into the array of line numbers it covers.
#
# range - a single number ("5") or an inclusive "start,end" pair ("5,7")
#
# Returns [5] for "5" and [5, 6, 7] for "5,7". A string without a second
# component is converted with String#to_i, so an empty string gives [0].
def range_to_linenumbers(range)
  first, last = range.split(',')
  return [range.to_i] if last.nil?

  (first.to_i..last.to_i).to_a
end
91
+
# Writes "<file>.hashed": one xxh32 hash per line of +file+, in file order.
#
# file - path of the file to hash
#
# The output is opened with 'w' so that a ".hashed" file left behind by an
# earlier, interrupted run is overwritten instead of appended to.
# File.open / File.foreach are used because they never treat a leading "|"
# in the path as a command to run.
def hash_job(file)
  File.open("#{file}.hashed", 'w') do |out|
    File.foreach(file) { |text| out << "#{XXhash.xxh32(text)}\n" }
  end
end
95
+
# Main pipeline: hash both files, diff the hashes, then print that diff with
# the hashes swapped back for the original lines.
old_hashed = "#{$old_file}.hashed"
new_hashed = "#{$new_file}.hashed"

begin
  # write one hash per line for each input file
  hash_job($old_file)
  hash_job($new_file)

  # diff the two hashed files; passing the command as an array bypasses the
  # shell, so paths with spaces or shell metacharacters are handled safely
  diff = IO.popen(['diff', old_hashed, new_hashed], &:read)

  # arrays of line numbers to retrieve, appended to by collect_command
  $old_ln = []
  $new_ln = []

  # first pass: collect the line numbers the diff commands refer to
  diff.each_line { |line| process line }

  old_lines = retrieve_lines($old_file, $old_ln.flatten)
  new_lines = retrieve_lines($new_file, $new_ln.flatten)

  # index the lines by line number for the output pass
  $old_index = Hash[$old_ln.flatten.zip(old_lines)]
  $new_index = Hash[$new_ln.flatten.zip(new_lines)]

  # second pass: print the diff with the original lines substituted
  diff.each_line { |line| output line }
ensure
  # clean up the temporary hash files, even when a step above raised
  [old_hashed, new_hashed].each { |path| File.delete(path) if File.exist?(path) }
end
metadata ADDED
@@ -0,0 +1,63 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hashed-diff
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Taco van Dijk
8
+ - Lodewijk Loos
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2015-12-07 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: xxhash
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: 0.3.0
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - "~>"
26
+ - !ruby/object:Gem::Version
27
+ version: 0.3.0
28
+ description: The hashed-diff ruby script is a reasonably fast, memory efficient script
29
+ that wraps diff for very large files (4Gb).
30
+ email:
31
+ - taco@waag.org
32
+ - lodewijk@waag.org
33
+ executables:
34
+ - hashed-diff
35
+ extensions: []
36
+ extra_rdoc_files: []
37
+ files:
38
+ - bin/hashed-diff
39
+ homepage: https://github.com/waagsociety/hashed-diff
40
+ licenses:
41
+ - MIT
42
+ metadata: {}
43
+ post_install_message:
44
+ rdoc_options: []
45
+ require_paths:
46
+ - lib
47
+ required_ruby_version: !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ version: '0'
52
+ required_rubygems_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
57
+ requirements: []
58
+ rubyforge_project:
59
+ rubygems_version: 2.4.5
60
+ signing_key:
61
+ specification_version: 4
62
+ summary: Memory efficient diff wrapper
63
+ test_files: []