wlsearchscraper 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/wlsearchscraper.rb +67 -5
  2. metadata +2 -2
@@ -1,5 +1,6 @@
1
1
  require 'nokogiri'
2
2
  require 'open-uri'
3
+ require 'json'
3
4
 
4
5
  class WLSearchScraper
5
6
  def initialize(searchterms)
@@ -7,7 +8,7 @@ class WLSearchScraper
7
8
  @resultlist = Array.new
8
9
  end
9
10
 
10
- # Returns array of document IDs matching search terms
11
+ # Returns pretty-printed JSON (array of per-cable metadata/content hashes) for documents matching search terms
11
12
  def scrape
12
13
  @searchterms.gsub!(" ", "+")
13
14
  url = "https://search.wikileaks.org/advanced?q=" + @searchterms + "&exclude_words=&words_title_only=&words_content_only=&publication_type[]=3"
@@ -15,11 +16,72 @@ class WLSearchScraper
15
16
 
16
17
  html.css("h4").each do |h|
17
18
  href = h.css("a")[0]["href"]
18
- split = href.split("/")
19
- cable = split[split.length-1].split("_a.html")
20
- @resultlist.push(cable[0])
19
+ @resultlist.push(cableParser(href))
21
20
  end
22
21
 
23
- return @resultlist
22
+ return JSON.pretty_generate(@resultlist)
23
+ end
24
+
25
# Fetches a single cable page and extracts its metadata fields and body text.
#
# @param url [String] absolute URL of the cable page
# @return [Hash] symbol-keyed metadata (:date, :id, :tags, ...) plus
#   :content holding the cable body; a key is absent when the page lacks
#   the corresponding field
def cableParser(url)
  # Maps the <a title="..."> label found in a metadata cell to the hash key
  # the adjacent value is stored under. Replaces a 19-branch elsif chain
  # that repeated the same emptiness check and selector on every branch.
  field_map = {
    "Date"                     => :date,
    "Canonical ID"             => :id,
    "Original Classification"  => :original_classification,
    "Current Classification"   => :current_classification,
    "Handling Restrictions"    => :handling_restrictions,
    "Character Count"          => :character_count,
    "Executive Order"          => :executive_order,
    "Locator"                  => :locator,
    "TAGS"                     => :tags,
    "Concepts"                 => :concepts,
    "Enclosure"                => :enclosure,
    "Type"                     => :type,
    "Office Origin"            => :office_origin,
    "Office Action"            => :office_action,
    "Archive Status"           => :archive_status,
    "From"                     => :from,
    "Markings"                 => :markings,
    "To"                       => :to,
    "Linked documents or other documents with the same ID" => :linked_docs
  }

  cablehash = Hash.new
  # NOTE(review): Kernel#open on a URL (via open-uri) is deprecated since
  # Ruby 2.7 and will follow redirects/local paths — consider URI.open and
  # validating the URL. Kept as-is to preserve behavior.
  html = Nokogiri::HTML(open(url))

  # Each metadata <td> carries an <a title="..."> label; the value text is
  # read from the cell via the "div[2]" selector kept from the original.
  # NOTE(review): Nokogiri parses "div[2]" as a CSS attribute selector, not
  # an XPath positional index — confirm it matches the intended second
  # <div>; t.xpath("div[2]") may be what was meant.
  html.css("td").each do |t|
    a = t.css("a")
    next if a.empty?
    key = field_map[a[0]["title"]]
    cablehash[key] = t.css("div[2]").text if key
  end

  # The cable body lives in the second div whose class attribute is exactly
  # "text-content" (exact string match, as in the original counter loop).
  content_divs = html.css("div").select { |d| d["class"] == "text-content" }
  cablehash[:content] = content_divs[1].text if content_divs[1]

  return cablehash
end
25
86
  end
87
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wlsearchscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-03-27 00:00:00.000000000 Z
12
+ date: 2014-03-28 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: Gets a list of documents from the WikiLeaks search that match certain
15
15
  terms.