wlsearchscraper 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/wlsearchscraper.rb +67 -5
- metadata +2 -2
data/lib/wlsearchscraper.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
require 'open-uri'
|
3
|
+
require 'json'
|
3
4
|
|
4
5
|
class WLSearchScraper
|
5
6
|
def initialize(searchterms)
|
@@ -7,7 +8,7 @@ class WLSearchScraper
|
|
7
8
|
@resultlist = Array.new
|
8
9
|
end
|
9
10
|
|
10
|
-
# Returns array of document
|
11
|
+
# Returns array of document URLs matching search terms
|
11
12
|
def scrape
|
12
13
|
@searchterms.gsub!(" ", "+")
|
13
14
|
url = "https://search.wikileaks.org/advanced?q=" + @searchterms + "&exclude_words=&words_title_only=&words_content_only=&publication_type[]=3"
|
@@ -15,11 +16,72 @@ class WLSearchScraper
|
|
15
16
|
|
16
17
|
html.css("h4").each do |h|
|
17
18
|
href = h.css("a")[0]["href"]
|
18
|
-
|
19
|
-
cable = split[split.length-1].split("_a.html")
|
20
|
-
@resultlist.push(cable[0])
|
19
|
+
@resultlist.push(cableParser(href))
|
21
20
|
end
|
22
21
|
|
23
|
-
return @resultlist
|
22
|
+
return JSON.pretty_generate(@resultlist)
|
23
|
+
end
|
24
|
+
|
25
|
+
def cableParser(url)
|
26
|
+
cablehash = Hash.new
|
27
|
+
html = Nokogiri::HTML(open(url))
|
28
|
+
|
29
|
+
# Go through and get all the metadata and content
|
30
|
+
html.css("td").each do |t|
|
31
|
+
a = t.css("a")
|
32
|
+
if !(a.empty?) && (a[0]["title"] == "Date")
|
33
|
+
cablehash[:date] = t.css("div[2]").text
|
34
|
+
elsif !(a.empty?) && (a[0]["title"] == "Canonical ID")
|
35
|
+
cablehash[:id] = t.css("div[2]").text
|
36
|
+
elsif !(a.empty?) && (a[0]["title"] == "Original Classification")
|
37
|
+
cablehash[:original_classification] = t.css("div[2]").text
|
38
|
+
elsif !(a.empty?) && (a[0]["title"] == "Current Classification")
|
39
|
+
cablehash[:current_classification] = t.css("div[2]").text
|
40
|
+
elsif !(a.empty?) && (a[0]["title"] == "Handling Restrictions")
|
41
|
+
cablehash[:handling_restrictions] = t.css("div[2]").text
|
42
|
+
elsif !(a.empty?) && (a[0]["title"] == "Character Count")
|
43
|
+
cablehash[:character_count] = t.css("div[2]").text
|
44
|
+
elsif !(a.empty?) && (a[0]["title"] == "Executive Order")
|
45
|
+
cablehash[:executive_order] = t.css("div[2]").text
|
46
|
+
elsif !(a.empty?) && (a[0]["title"] == "Locator")
|
47
|
+
cablehash[:locator] = t.css("div[2]").text
|
48
|
+
elsif !(a.empty?) && (a[0]["title"] == "TAGS")
|
49
|
+
cablehash[:tags] = t.css("div[2]").text
|
50
|
+
elsif !(a.empty?) && (a[0]["title"] == "Concepts")
|
51
|
+
cablehash[:concepts] = t.css("div[2]").text
|
52
|
+
elsif !(a.empty?) && (a[0]["title"] == "Enclosure")
|
53
|
+
cablehash[:enclosure] = t.css("div[2]").text
|
54
|
+
elsif !(a.empty?) && (a[0]["title"] == "Type")
|
55
|
+
cablehash[:type] = t.css("div[2]").text
|
56
|
+
elsif !(a.empty?) && (a[0]["title"] == "Office Origin")
|
57
|
+
cablehash[:office_origin] = t.css("div[2]").text
|
58
|
+
elsif !(a.empty?) && (a[0]["title"] == "Office Action")
|
59
|
+
cablehash[:office_action] = t.css("div[2]").text
|
60
|
+
elsif !(a.empty?) && (a[0]["title"] == "Archive Status")
|
61
|
+
cablehash[:archive_status] = t.css("div[2]").text
|
62
|
+
elsif !(a.empty?) && (a[0]["title"] == "From")
|
63
|
+
cablehash[:from] = t.css("div[2]").text
|
64
|
+
elsif !(a.empty?) && (a[0]["title"] == "Markings")
|
65
|
+
cablehash[:markings] = t.css("div[2]").text
|
66
|
+
elsif !(a.empty?) && (a[0]["title"] == "To")
|
67
|
+
cablehash[:to] = t.css("div[2]").text
|
68
|
+
elsif !(a.empty?) && (a[0]["title"] == "Linked documents or other documents with the same ID")
|
69
|
+
cablehash[:linked_docs] = t.css("div[2]").text
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Get cable content
|
74
|
+
contentcount = 0
|
75
|
+
html.css("div").each do |d|
|
76
|
+
if d["class"] == "text-content"
|
77
|
+
contentcount += 1
|
78
|
+
if contentcount == 2
|
79
|
+
cablehash[:content] = d.text
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
return cablehash
|
24
85
|
end
|
25
86
|
end
|
87
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wlsearchscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-03-
|
12
|
+
date: 2014-03-28 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Gets a list of documents from the WikiLeaks search that match certain
|
15
15
|
terms.
|