hashed-diff 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/hashed-diff +126 -0
- metadata +63 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 6e5f755e4ca0d07b9b744614c92b5f773f493171
|
4
|
+
data.tar.gz: 42fcbec2684414a4c118d0b847f091a2f0675838
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c6545b833e34266139462e6965e1b9e55894bd4db30acb80c555e1731dcf3c9a83dc35d24de6ee1a53420eff8f4cf0e54fd9756dfc9c7d7f9edede5351f1bed0
|
7
|
+
data.tar.gz: 22377f8eb95bdef332579ffd9ea6a2e585e50391037a48454bccdde12d1811f1c5f87dde3a3420784da8e8ca9787667fd26351535e728b1187441484eaf7f80d
|
data/bin/hashed-diff
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'xxhash'
|
3
|
+
#
|
4
|
+
# hashed_diff - Copyright Waag Society 2015 -
|
5
|
+
# Taco van Dijk & Lodewijk Loos
|
6
|
+
#
|
7
|
+
# reasonably fast, memory efficient diff for very large files
|
8
|
+
#
|
9
|
+
# uses the very fast xxhash to create temporary files with hashes for each individual line
|
10
|
+
# applies diff to these hashed files
|
11
|
+
# transforms the diff output to include the original lines
|
12
|
+
#
|
13
|
+
# Improvements:
|
14
|
+
# speed doesn't seem to improve using two threads,
|
15
|
+
# this is because we're hitting I/O limits when hashing, which degrades performance
|
16
|
+
#
|
17
|
+
# Both file arguments are required. Report incorrect usage on stderr and exit
# with a non-zero status so that calling scripts can detect the failure.
unless ARGV[0] && ARGV[1]
  warn 'usage: hashed-diff OLD_FILE NEW_FILE'
  exit(1)
end

$old_file = ARGV[0]
$new_file = ARGV[1]

# Regexps classifying the lines of the diff output.
# They are anchored and require at least one digit so that a line merely
# containing the letter a, c or d is never mistaken for a command.
# Capture groups of $command: 1 = old range, 2 = command letter, 3 = new range.
$command = /\A(\d+(?:,\d+)?)([acd])(\d+(?:,\d+)?)$/
$old_line = /\A< \d+/
$new_line = /\A> \d+/
$divider = /\A---$/
|
27
|
+
|
28
|
+
# First pass over the diff output: when +line+ is a diff command, hand it to
# collect_command so the line numbers it mentions are recorded.
# Lines that do not match $command are ignored.
def process(line)
  command = $command.match(line)
  collect_command(line, command) if command
end
|
35
|
+
|
36
|
+
# Second pass over the diff output: print +line+ with hashes replaced by the
# original text.
#
# * A command line (matching $command) is echoed unchanged, after
#   process_command has loaded its line numbers into $old_ln / $new_ln.
# * A "<" or ">" line consumes the next number from $old_ln / $new_ln and
#   prints the entry stored under that number in $old_index / $new_index.
# * A divider line is echoed unchanged.
# * Any other line prints nothing.
def output(line)
  command = $command.match(line)
  if command
    process_command(line, command)
    puts line
    return
  end

  case line
  when $old_line
    puts "< #{$old_index[$old_ln.shift]}"
  when $new_line
    puts "> #{$new_index[$new_ln.shift]}"
  when $divider
    puts line
  end
end
|
55
|
+
|
56
|
+
# Retrieve the requested lines from a file in a single sequential pass.
#
# path - path of the file to read
# lns  - array of 1-based line numbers to fetch; it is not modified
#
# Returns an array parallel to +lns+: element i holds the text of line lns[i]
# (including its line terminator), or nil when the file has no such line
# (for example line number 0). Keeping the result aligned with +lns+ lets
# callers zip the two arrays safely.
def retrieve_lines(path, lns)
  return [] if lns.empty?

  # Hash keyed by wanted line number: constant-time membership test per line.
  found = {}
  lns.each { |number| found[number] = nil }
  last_wanted = lns.max

  # File.foreach streams the file line by line; stop as soon as the highest
  # requested line has been seen so the rest of a large file is not read.
  File.foreach(path).with_index(1) do |line, number|
    found[number] = line if found.key?(number)
    break if number >= last_wanted
  end

  lns.map { |number| found[number] }
end
|
70
|
+
|
71
|
+
# Load the line numbers of one diff command for the output pass.
#
# line  - the raw command line (unused)
# match - MatchData from $command; group 1 is the old range, group 3 the new
#
# Replaces $old_ln and $new_ln with the flat lists of line numbers covered by
# the command, ready to be consumed one by one while printing.
def process_command(line, match)
  old_range = match[1]
  new_range = match[3]
  $old_ln = range_to_linenumbers(old_range).flatten
  $new_ln = range_to_linenumbers(new_range).flatten
end
|
77
|
+
|
78
|
+
# Record the line numbers of one diff command for later retrieval.
#
# line  - the raw command line (unused)
# match - MatchData from $command; group 1 is the old range, group 2 the
#         command letter (a, c or d), group 3 the new range
#
# Only the side whose lines the command actually prints is collected: an "a"
# command prints no old lines and a "d" command prints no new lines. The number
# on that side is only a position (possibly 0, which is not a real line), so
# collecting it would request lines that are never used.
#
# Appends to $old_ln / $new_ln and returns $new_ln.
def collect_command(line, match)
  kind = match[2]
  $old_ln << range_to_linenumbers(match[1]) unless kind == 'a'
  $new_ln << range_to_linenumbers(match[3]) unless kind == 'd'
  $new_ln
end
|
84
|
+
|
85
|
+
# Convert a diff range string into an array of line numbers.
#
# range - either a single number ("7") or "start,end" ("3,6")
#
# Returns [7] for "7" and [3, 4, 5, 6] for "3,6". A string without a second
# comma-separated part is converted with String#to_i, so "" yields [0].
def range_to_linenumbers(range)
  first, last = range.split(',')
  return [range.to_i] if last.nil?

  (first.to_i..last.to_i).to_a
end
|
91
|
+
|
92
|
+
# Write "<file>.hashed": one line per line of +file+, holding the
# XXhash.xxh32 value of that line.
#
# The output is opened in write mode so a stale ".hashed" file left behind by
# an interrupted run is overwritten instead of appended to. File.open and
# File.foreach are used so the path is always treated as a plain file name.
def hash_job(file)
  File.open("#{file}.hashed", 'w') do |hashed|
    File.foreach(file) { |line| hashed << "#{XXhash.xxh32(line)}\n" }
  end
end
|
95
|
+
|
96
|
+
# Main script: hash both files, diff the hashes, then print the diff with the
# hashes replaced by the original lines.
old_hashed = "#{$old_file}.hashed"
new_hashed = "#{$new_file}.hashed"

begin
  # Hash both input files, one after the other.
  hash_job($old_file)
  hash_job($new_file)

  # Diff the two hashed files. The argument-array form of IO.popen bypasses
  # the shell, so paths containing spaces or shell metacharacters are passed
  # through verbatim; '--' keeps diff from reading a path as an option.
  diff = IO.popen(['diff', '--', old_hashed, new_hashed], &:read)

  # Arrays of line numbers to retrieve; filled by the first pass.
  $old_ln = []
  $new_ln = []

  # First pass: collect the line numbers mentioned by the diff commands.
  diff.each_line do |line|
    process line
  end

  old_numbers = $old_ln.flatten
  new_numbers = $new_ln.flatten

  old_lines = retrieve_lines($old_file, old_numbers.dup)
  new_lines = retrieve_lines($new_file, new_numbers.dup)

  # Index the original lines by line number for the output pass.
  $old_index = Hash[old_numbers.zip(old_lines)]
  $new_index = Hash[new_numbers.zip(new_lines)]

  # Second pass: print the diff with the original lines substituted.
  diff.each_line do |line|
    output line
  end
ensure
  # Clean up the temporary hash files even when a step above failed.
  [old_hashed, new_hashed].each do |path|
    File.delete(path) if File.exist?(path)
  end
end
|
metadata
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: hashed-diff
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Taco van Dijk
|
8
|
+
- Lodewijk Loos
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2015-12-07 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: xxhash
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: 0.3.0
|
21
|
+
type: :runtime
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - "~>"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: 0.3.0
|
28
|
+
description: The hashed-diff ruby script is a reasonably fast, memory efficient script
|
29
|
+
that wraps diff for very large files (4Gb).
|
30
|
+
email:
|
31
|
+
- taco@waag.org
|
32
|
+
- lodewijk@waag.org
|
33
|
+
executables:
|
34
|
+
- hashed-diff
|
35
|
+
extensions: []
|
36
|
+
extra_rdoc_files: []
|
37
|
+
files:
|
38
|
+
- bin/hashed-diff
|
39
|
+
homepage: https://github.com/waagsociety/hashed-diff
|
40
|
+
licenses:
|
41
|
+
- MIT
|
42
|
+
metadata: {}
|
43
|
+
post_install_message:
|
44
|
+
rdoc_options: []
|
45
|
+
require_paths:
|
46
|
+
- lib
|
47
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
48
|
+
requirements:
|
49
|
+
- - ">="
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: '0'
|
52
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: '0'
|
57
|
+
requirements: []
|
58
|
+
rubyforge_project:
|
59
|
+
rubygems_version: 2.4.5
|
60
|
+
signing_key:
|
61
|
+
specification_version: 4
|
62
|
+
summary: Memory efficient diff wrapper
|
63
|
+
test_files: []
|