wlsearchscraper 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/wlsearchscraper.rb +67 -5
- metadata +2 -2
data/lib/wlsearchscraper.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
require 'open-uri'
|
3
|
+
require 'json'
|
3
4
|
|
4
5
|
class WLSearchScraper
|
5
6
|
def initialize(searchterms)
|
@@ -7,7 +8,7 @@ class WLSearchScraper
|
|
7
8
|
@resultlist = Array.new
|
8
9
|
end
|
9
10
|
|
10
|
-
# Returns array of document
|
11
|
+
# Returns array of document URLs matching search terms
|
11
12
|
def scrape
|
12
13
|
@searchterms.gsub!(" ", "+")
|
13
14
|
url = "https://search.wikileaks.org/advanced?q=" + @searchterms + "&exclude_words=&words_title_only=&words_content_only=&publication_type[]=3"
|
@@ -15,11 +16,72 @@ class WLSearchScraper
|
|
15
16
|
|
16
17
|
html.css("h4").each do |h|
|
17
18
|
href = h.css("a")[0]["href"]
|
18
|
-
|
19
|
-
cable = split[split.length-1].split("_a.html")
|
20
|
-
@resultlist.push(cable[0])
|
19
|
+
@resultlist.push(cableParser(href))
|
21
20
|
end
|
22
21
|
|
23
|
-
return @resultlist
|
22
|
+
return JSON.pretty_generate(@resultlist)
|
23
|
+
end
|
24
|
+
|
25
|
+
# Scrapes a single WikiLeaks cable page and extracts its metadata fields
# and body text.
#
# url - String URL of one cable page (as pushed by #scrape).
#
# Returns a Hash with symbol keys (:date, :id, :tags, ..., :content).
# Fields not present on the page are simply absent from the hash.
def cableParser(url)
  # Maps the <a title="..."> label used on the cable page to the symbol
  # key we store its value under. Replaces the original 19-branch elsif
  # chain, which re-checked a.empty? on every branch.
  title_to_key = {
    "Date"                    => :date,
    "Canonical ID"            => :id,
    "Original Classification" => :original_classification,
    "Current Classification"  => :current_classification,
    "Handling Restrictions"   => :handling_restrictions,
    "Character Count"         => :character_count,
    "Executive Order"         => :executive_order,
    "Locator"                 => :locator,
    "TAGS"                    => :tags,
    "Concepts"                => :concepts,
    "Enclosure"               => :enclosure,
    "Type"                    => :type,
    "Office Origin"           => :office_origin,
    "Office Action"           => :office_action,
    "Archive Status"          => :archive_status,
    "From"                    => :from,
    "Markings"                => :markings,
    "To"                      => :to,
    "Linked documents or other documents with the same ID" => :linked_docs
  }.freeze

  cablehash = Hash.new
  # NOTE(review): Kernel#open on a URL relies on the open-uri monkey patch,
  # which was removed in Ruby 3.0 — URI.open is the modern spelling.
  html = Nokogiri::HTML(open(url))

  # Each metadata field sits in a <td> whose first <a> carries a title label.
  html.css("td").each do |t|
    a = t.css("a")
    next if a.empty?

    key = title_to_key[a[0]["title"]]
    # NOTE(review): "div[2]" is passed to Nokogiri's CSS engine here, exactly
    # as in the original — confirm it selects the intended second <div>
    # (XPath div[2] / CSS div:nth-of-type(2) may be what was meant).
    cablehash[key] = t.css("div[2]").text if key
  end

  # The cable body is the second div.text-content on the page; earlier
  # text-content divs hold other material, so count until the second.
  contentcount = 0
  html.css("div").each do |d|
    next unless d["class"] == "text-content"

    contentcount += 1
    cablehash[:content] = d.text if contentcount == 2
  end

  cablehash
end
|
25
86
|
end
|
87
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wlsearchscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-03-
|
12
|
+
date: 2014-03-28 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Gets a list of documents from the WikiLeaks search that match certain
|
15
15
|
terms.
|