url_trimmer 0.0.2 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +11 -1
- data/bin/url-differ +9 -0
- data/lib/url_trimmer/differ.rb +19 -0
- data/lib/url_trimmer/version.rb +1 -1
- data/spec/lib/url_trimmer/differ_spec.rb +21 -0
- data/url_trimmer.gemspec +1 -0
- metadata +21 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 04dae9e3b37ea013c12031188024c628f8df916a
|
4
|
+
data.tar.gz: 32d6caa837aeb69f97ab97c2609b6b3a7ac65dc7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b7a84541cf9f49aa591fc59ed6063faf3da67ba53cbb6d74ac97cac31b577829460c04a52e0459163096f3693b30825883e5dd4dd9aef585f89fbfb158668258
|
7
|
+
data.tar.gz: 1ba47225a09ddf0d16c233bc59e7680d70550b6f725a895eb4601f185569c4358862090b4b1087a94c070ad1f1a3ee85e005ca83d261f560908a2eb927a2061c
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -10,7 +10,17 @@ Install it yourself as:
|
|
10
10
|
|
11
11
|
## Usage
|
12
12
|
|
13
|
-
|
13
|
+
### URL Trimmer
|
14
|
+
|
15
|
+
```bash
|
16
|
+
url-trimmer urls1.txt urls2.txt .. urlsN.txt > unique-urls.txt
|
17
|
+
```
|
18
|
+
|
19
|
+
### URL Differ
|
20
|
+
|
21
|
+
```bash
|
22
|
+
url-differ urls1.txt urls2.txt > urls2-unique.txt
|
23
|
+
```
|
14
24
|
|
15
25
|
## Contributing
|
16
26
|
|
data/bin/url-differ
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative "../lib/url_trimmer/differ"
|
4
|
+
|
5
|
+
# Command-line entry point: reads two plain-text files holding one URL per
# line, hands both lists to URLTrimmer::Differ.between and prints the result,
# one URL per line, on standard output.
#
# Usage: url-differ urls1.txt urls2.txt > urls2-unique.txt
file1, file2 = ARGV[0], ARGV[1]
abort "Usage: url-differ urls1.txt urls2.txt" unless file1 && file2

# Strip the line terminators BEFORE discarding blanks: File.readlines keeps the
# trailing "\n", so a blank line is "\n" (not empty) until it has been chomped.
read_urls = lambda do |path|
  File.readlines(path).map(&:chomp).reject(&:empty?)
end

urls1 = read_urls.call(file1)
urls2 = read_urls.call(file2)
unique_urls = URLTrimmer::Differ.between(urls1, urls2)
print "#{unique_urls.join("\n")}\n"
|
data/lib/url_trimmer/differ.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require_relative "../url_trimmer"
|
2
|
+
|
3
|
+
module URLTrimmer
  # Compares two lists of URLs domain by domain.
  class Differ
    # Returns the URLs of +urls2+ whose domain does not occur in +urls1+.
    #
    # Both lists are first reduced with Worker.uniq_by_domain. Each URL is then
    # cut down to the part matched by Worker::URL_REGEXP and resolved to a
    # domain with DomainName(...).domain; a URL from +urls2+ is dropped when
    # its domain was also produced by +urls1+.
    #
    # @param urls1 [Array<String>] reference URLs
    # @param urls2 [Array<String>] candidate URLs
    # @return [Array<String>] the surviving URLs from +urls2+, sorted
    def self.between(urls1, urls2)
      # A Hash gives constant-time membership checks; scanning an Array of
      # domains for every candidate made the comparison quadratic.
      known_domains = Worker.uniq_by_domain(urls1).each_with_object({}) do |url, seen|
        seen[domain_of(url)] = true
      end

      # Non-mutating reject/sort leave the arrays handed back by Worker untouched.
      Worker.uniq_by_domain(urls2)
            .reject { |url| known_domains.key?(domain_of(url)) }
            .sort
    end

    # Resolves the domain of +url+ from the portion matched by Worker::URL_REGEXP.
    def self.domain_of(url)
      DomainName(url[Worker::URL_REGEXP, 0]).domain
    end
    private_class_method :domain_of
  end
end
|
data/lib/url_trimmer/version.rb
CHANGED
data/spec/lib/url_trimmer/differ_spec.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require File.expand_path("spec/spec_helper")
|
2
|
+
require File.expand_path("lib/url_trimmer/differ")
|
3
|
+
|
4
|
+
module URLTrimmer
  # Spec for Differ.between: URLs from the second list survive only when their
  # domain is absent from the first list.
  describe Differ do
    describe ".between(urls1, urls2)" do
      let(:urls1) do
        ["http://www.google.com/1", "https://www.google.com.ar/2"]
      end

      let(:urls2) do
        [
          "https://www.google.com/3",
          "http://www.google.com.ar/4",
          "http://www.google.com.br/5",
          "http://www.example.com/6"
        ]
      end

      # Computed lazily, once per example.
      let(:unique_urls) { Differ.between(urls1, urls2) }

      it "removes URLs from urls2 already present in urls1" do
        unique_urls.size.must_equal 2

        # Only the two domains missing from urls1 may remain.
        ["http://www.google.com.br/5", "http://www.example.com/6"].each do |expected_url|
          unique_urls.must_include(expected_url)
        end
      end
    end
  end
end
|
data/url_trimmer.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_trimmer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-07-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: domain_name
|
@@ -52,10 +52,25 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 5.3.5
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 5.3.5
|
55
69
|
description:
|
56
70
|
email:
|
57
71
|
- cristian@box.cristianrasch.com.ar
|
58
72
|
executables:
|
73
|
+
- url-differ
|
59
74
|
- url-trimmer
|
60
75
|
extensions: []
|
61
76
|
extra_rdoc_files: []
|
@@ -66,9 +81,12 @@ files:
|
|
66
81
|
- LICENSE.txt
|
67
82
|
- README.md
|
68
83
|
- Rakefile
|
84
|
+
- bin/url-differ
|
69
85
|
- bin/url-trimmer
|
70
86
|
- lib/url_trimmer.rb
|
87
|
+
- lib/url_trimmer/differ.rb
|
71
88
|
- lib/url_trimmer/version.rb
|
89
|
+
- spec/lib/url_trimmer/differ_spec.rb
|
72
90
|
- spec/lib/url_trimmer_spec.rb
|
73
91
|
- spec/spec_helper.rb
|
74
92
|
- url_trimmer.gemspec
|
@@ -98,5 +116,6 @@ specification_version: 4
|
|
98
116
|
summary: Reads in plain text files with one URL per line and outputs a list of unique
|
99
117
|
URLs by domain
|
100
118
|
test_files:
|
119
|
+
- spec/lib/url_trimmer/differ_spec.rb
|
101
120
|
- spec/lib/url_trimmer_spec.rb
|
102
121
|
- spec/spec_helper.rb
|