piplcollector 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/piplcollector.rb +116 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 3586ce00624a23fabe60ec6135415221723b8ec0
|
4
|
+
data.tar.gz: 41dfc1accf74bc77c0c8a09ad0374f0b622d8592
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 43229a308129b7211ffeb4401fff72ed2a6d46479c300620431de696a675f9679d00975ad4eb506d765593c018fa4b859f82249174baac0ea884e530713363c1
|
7
|
+
data.tar.gz: fcc01c2ad1ec8c8029b9d8cc605de476f481b2784e5adffb47839e3a64ddbaee915a1ff4a477303498d242209f2a5ab85983f3ee8af3f64b0ea1826580b7ee27
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require 'fileutils'
require 'json'

require 'piplrequest'
require 'pry'
|
4
|
+
|
5
|
+
# Walks a directory tree of JSON files, attaches Pipl data to every record and
# writes the enriched files to a mirrored tree under another directory.
#
# Raw Pipl responses are cached as "<id>.json" files in +output_dir+; a record
# whose cache file is already known is read back from disk instead of being
# requested again.
class PiplCollector
  # Seconds to sleep before every Pipl request.
  # NOTE(review): presumably an API rate-limit courtesy -- confirm before lowering.
  REQUEST_DELAY_SECONDS = 1

  # Extension shared by the input files and the cached responses.
  JSON_EXTENSION = ".json"

  # @param input_dir [String] root of the tree handed to #run; used as the
  #   prefix that gets swapped for +output_append_dir+
  # @param output_dir [String] directory holding one cached response per record
  # @param output_append_dir [String] root of the mirrored tree of enriched files
  # @param id_field [String] key of the record field used to name cache files
  # @param ignore_files [String, Array<String>, nil] substring(s); a file whose
  #   name contains any of them is skipped. nil and empty strings ignore nothing.
  # @param api_key passed through to PiplRequest.new
  # @param field_mapping passed through to PiplRequest.new
  def initialize(input_dir, output_dir, output_append_dir, id_field, ignore_files, api_key, field_mapping)
    @input_dir = input_dir
    @output_dir = output_dir
    @output_append_dir = output_append_dir
    @id_field = id_field
    @ignore_files = ignore_files
    @api_key = api_key
    @field_mapping = field_mapping
    @already_collected = load_output_files
  end

  # Lists the ids that already have a cached response.
  #
  # @return [Array<String>] entry names of +output_dir+ with a trailing ".json"
  #   removed; empty when the directory does not exist yet
  def load_output_files
    return [] unless File.directory?(@output_dir)

    (Dir.entries(@output_dir) - %w[. ..]).map do |file|
      File.basename(file, JSON_EXTENSION)
    end
  end

  # Writes a raw response to the cache and remembers its id.
  #
  # @param output_item [String] content written verbatim to the cache file
  # @param data_item [Hash] record the response belongs to
  def save_output_file(output_item, data_item)
    id = gen_filename_from_id(data_item)
    # The cache directory may not exist on a first run.
    FileUtils.mkdir_p(@output_dir)
    File.write(cache_path(id), output_item)
    @already_collected.push(id)
  end

  # Generates a file-safe name from the id field by dropping ":", "/" and ".".
  #
  # @param data_item [Hash] record containing the id field
  # @return [String]
  # @raise [ArgumentError] when the record has no value for the id field
  #   (otherwise every such record would share one cache file)
  def gen_filename_from_id(data_item)
    id = data_item[@id_field]
    raise ArgumentError, "record has no value for id field #{@id_field.inspect}" if id.nil?

    # to_s lets numeric ids work as well as string ids.
    id.to_s.delete(":/.")
  end

  # Checks whether a response for this record is already cached.
  #
  # @param data_item [Hash]
  # @return [Boolean]
  def was_collected?(data_item)
    @already_collected.include?(gen_filename_from_id(data_item))
  end

  # Requests a record from Pipl, caching the raw response when there is one.
  #
  # @param data_item [Hash]
  # @return [Object, nil] the parsed response, or nil when PiplRequest#get_data
  #   returned nothing
  def get_person(data_item)
    sleep(REQUEST_DELAY_SECONDS)

    output = PiplRequest.new(@api_key, @field_mapping).get_data(data_item)
    return unless output

    save_output_file(output, data_item)
    JSON.parse(output)
  end

  # Reads the cached response for a record.
  #
  # @param data_item [Hash]
  # @return [Object] the parsed cache file
  def get_already_collected_person(data_item)
    JSON.parse(File.read(cache_path(gen_filename_from_id(data_item))))
  end

  # Enriches every record of one input file.
  #
  # @param file [String] path of a JSON file; the code iterates it as an array
  #   of hashes
  # @return [String] pretty-printed JSON of the records, each with a "pipl" entry
  def process(file)
    records = JSON.parse(File.read(file))

    enriched = records.map do |item|
      # String key: parsed JSON hashes are string-keyed, so this overwrites an
      # existing "pipl" entry instead of emitting the key twice.
      item["pipl"] = was_collected?(item) ? get_already_collected_person(item) : get_person(item)
      item
    end

    JSON.pretty_generate(enriched)
  end

  # Creates +dir+ and any missing parents. Relative paths stay relative to the
  # current working directory.
  #
  # @param dir [String]
  def create_write_dirs(dir)
    return if dir.to_s.empty?

    FileUtils.mkdir_p(dir)
  end

  # Figures out where the enriched copy of +file+ is written.
  #
  # @param dir [String] directory containing the input file
  # @param file [String] file name
  # @return [String] path inside the mirrored output tree
  def get_write_dir(dir, file)
    "#{mirrored_dir(dir)}/#{file}"
  end

  # Recursively processes every JSON file under +dir+ that is not ignored and
  # has no enriched copy yet.
  #
  # @param dir [String]
  def run(dir)
    Dir.foreach(dir) do |file|
      next if file == '.' || file == '..'

      path = "#{dir}/#{file}"
      if File.directory?(path)
        run(path)
      elsif json_file?(file) && !ignored?(file)
        target = get_write_dir(dir, file)
        next if File.exist?(target)

        with_pipl = process(path)
        create_write_dirs(mirrored_dir(dir))
        File.write(target, with_pipl)
      end
    end
  end

  private

  # Path of the cache file for an already sanitised id.
  def cache_path(id)
    "#{@output_dir}/#{id}#{JSON_EXTENSION}"
  end

  # Maps an input directory onto the output tree by swapping only the leading
  # input_dir prefix, so a nested directory that repeats the name is left alone.
  def mirrored_dir(dir)
    if dir.start_with?(@input_dir)
      @output_append_dir + dir[@input_dir.length..-1]
    else
      # Directory outside input_dir: keep the historical substitution.
      dir.gsub(@input_dir, @output_append_dir)
    end
  end

  # True only for names ending in ".json" (so "x.json.bak" is not parsed).
  def json_file?(file)
    file.end_with?(JSON_EXTENSION)
  end

  # True when the file name contains any non-empty ignore pattern.
  def ignored?(file)
    Array(@ignore_files).any? do |pattern|
      text = pattern.to_s
      !text.empty? && file.include?(text)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: piplcollector
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- M. C. McGrath
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-01-21 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Gets data from Pipl for dir of files
|
14
|
+
email: shidash@shidash.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/piplcollector.rb
|
20
|
+
homepage: https://github.com/TransparencyToolkit/piplcollector
|
21
|
+
licenses:
|
22
|
+
- GPL
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">="
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.4.8
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Gets data from Pipl for dir of files
|
44
|
+
test_files: []
|
45
|
+
has_rdoc:
|