wlsearchscraper 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/wlsearchscraper.rb +67 -5
  2. metadata +2 -2
@@ -1,5 +1,6 @@
1
1
  require 'nokogiri'
2
2
  require 'open-uri'
3
+ require 'json'
3
4
 
4
5
  class WLSearchScraper
5
6
  def initialize(searchterms)
@@ -7,7 +8,7 @@ class WLSearchScraper
7
8
  @resultlist = Array.new
8
9
  end
9
10
 
10
- # Returns array of document IDs matching search terms
11
+ # Returns pretty-printed JSON (array of per-cable metadata/content hashes) for documents matching search terms
11
12
  def scrape
12
13
  @searchterms.gsub!(" ", "+")
13
14
  url = "https://search.wikileaks.org/advanced?q=" + @searchterms + "&exclude_words=&words_title_only=&words_content_only=&publication_type[]=3"
@@ -15,11 +16,72 @@ class WLSearchScraper
15
16
 
16
17
  html.css("h4").each do |h|
17
18
  href = h.css("a")[0]["href"]
18
- split = href.split("/")
19
- cable = split[split.length-1].split("_a.html")
20
- @resultlist.push(cable[0])
19
+ @resultlist.push(cableParser(href))
21
20
  end
22
21
 
23
- return @resultlist
22
+ return JSON.pretty_generate(@resultlist)
23
+ end
24
+
25
# Fetches a single cable page and extracts its metadata fields and body text.
#
# @param url [String] absolute URL of the cable page
# @return [Hash] symbol-keyed metadata (:date, :id, :tags, ...) plus
#   :content holding the cable body; a key is absent when the page lacks
#   the corresponding field
def cableParser(url)
  # Maps the <a title="..."> label found in a metadata cell to the hash key
  # the adjacent value is stored under. Replaces a 19-branch elsif chain
  # that repeated the same emptiness check and selector on every branch.
  field_map = {
    "Date"                     => :date,
    "Canonical ID"             => :id,
    "Original Classification"  => :original_classification,
    "Current Classification"   => :current_classification,
    "Handling Restrictions"    => :handling_restrictions,
    "Character Count"          => :character_count,
    "Executive Order"          => :executive_order,
    "Locator"                  => :locator,
    "TAGS"                     => :tags,
    "Concepts"                 => :concepts,
    "Enclosure"                => :enclosure,
    "Type"                     => :type,
    "Office Origin"            => :office_origin,
    "Office Action"            => :office_action,
    "Archive Status"           => :archive_status,
    "From"                     => :from,
    "Markings"                 => :markings,
    "To"                       => :to,
    "Linked documents or other documents with the same ID" => :linked_docs
  }

  cablehash = Hash.new
  # NOTE(review): Kernel#open on a URL (via open-uri) is deprecated since
  # Ruby 2.7 and will follow redirects/local paths — consider URI.open and
  # validating the URL. Kept as-is to preserve behavior.
  html = Nokogiri::HTML(open(url))

  # Each metadata <td> carries an <a title="..."> label; the value text is
  # read from the cell via the "div[2]" selector kept from the original.
  # NOTE(review): Nokogiri parses "div[2]" as a CSS attribute selector, not
  # an XPath positional index — confirm it matches the intended second
  # <div>; t.xpath("div[2]") may be what was meant.
  html.css("td").each do |t|
    a = t.css("a")
    next if a.empty?
    key = field_map[a[0]["title"]]
    cablehash[key] = t.css("div[2]").text if key
  end

  # The cable body lives in the second div whose class attribute is exactly
  # "text-content" (exact string match, as in the original counter loop).
  content_divs = html.css("div").select { |d| d["class"] == "text-content" }
  cablehash[:content] = content_divs[1].text if content_divs[1]

  return cablehash
end
25
86
  end
87
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wlsearchscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-03-27 00:00:00.000000000 Z
12
+ date: 2014-03-28 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: Gets a list of documents from the WikiLeaks search that match certain
15
15
  terms.