wlsearchscraper 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. data/lib/wlsearchscraper.rb +67 -5
  2. metadata +2 -2
@@ -1,5 +1,6 @@
1
1
  require 'nokogiri'
2
2
  require 'open-uri'
3
+ require 'json'
3
4
 
4
5
  class WLSearchScraper
5
6
  def initialize(searchterms)
@@ -7,7 +8,7 @@ class WLSearchScraper
7
8
  @resultlist = Array.new
8
9
  end
9
10
 
10
- # Returns array of document IDs matching search terms
11
+ # Returns a pretty-printed JSON string of the parsed cables (metadata and content) for documents matching the search terms
11
12
  def scrape
12
13
  @searchterms.gsub!(" ", "+")
13
14
  url = "https://search.wikileaks.org/advanced?q=" + @searchterms + "&exclude_words=&words_title_only=&words_content_only=&publication_type[]=3"
@@ -15,11 +16,72 @@ class WLSearchScraper
15
16
 
16
17
  html.css("h4").each do |h|
17
18
  href = h.css("a")[0]["href"]
18
- split = href.split("/")
19
- cable = split[split.length-1].split("_a.html")
20
- @resultlist.push(cable[0])
19
+ @resultlist.push(cableParser(href))
21
20
  end
22
21
 
23
- return @resultlist
22
+ return JSON.pretty_generate(@resultlist)
23
+ end
24
+
25
# Maps the `title` attribute of the first link in a metadata cell to the
# key under which that cell's value is stored in the cable hash.
CABLE_FIELD_KEYS = {
  "Date" => :date,
  "Canonical ID" => :id,
  "Original Classification" => :original_classification,
  "Current Classification" => :current_classification,
  "Handling Restrictions" => :handling_restrictions,
  "Character Count" => :character_count,
  "Executive Order" => :executive_order,
  "Locator" => :locator,
  "TAGS" => :tags,
  "Concepts" => :concepts,
  "Enclosure" => :enclosure,
  "Type" => :type,
  "Office Origin" => :office_origin,
  "Office Action" => :office_action,
  "Archive Status" => :archive_status,
  "From" => :from,
  "Markings" => :markings,
  "To" => :to,
  "Linked documents or other documents with the same ID" => :linked_docs
}.freeze

# Fetches the page at +url+ and extracts its metadata fields and content.
#
# url - String URL of the page to parse (presumably an absolute http(s)
#       cable URL taken from the search results; verify against caller).
#
# Returns a Hash with a symbol key for every recognised metadata cell found
# on the page, plus :content when the page has at least two
# <div class="text-content"> elements.
def cableParser(url)
  cablehash = {}

  # Open through the parsed URI rather than Kernel#open: open-uri no longer
  # patches Kernel#open on Ruby 3.0+, and Kernel#open would execute strings
  # beginning with "|" as shell commands. The block form closes the stream
  # as soon as the document has been parsed.
  html = URI.parse(url).open { |io| Nokogiri::HTML(io) }

  # Go through and get all the metadata
  html.css("td").each do |cell|
    link = cell.css("a").first
    next unless link

    key = CABLE_FIELD_KEYS[link["title"]]
    cablehash[key] = cell.css("div[2]").text if key
  end

  # Get cable content: the second div whose class is exactly "text-content"
  content_div = html.css("div").select { |d| d["class"] == "text-content" }[1]
  cablehash[:content] = content_div.text if content_div

  cablehash
end
25
86
  end
87
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wlsearchscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-03-27 00:00:00.000000000 Z
12
+ date: 2014-03-28 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: Gets a list of documents from the WikiLeaks search that match certain
15
15
  terms.