wlsearchscraper 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/wlsearchscraper.rb +67 -5
 - metadata +2 -2
 
    
        data/lib/wlsearchscraper.rb
    CHANGED
    
    | 
         @@ -1,5 +1,6 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            require 'nokogiri'
         
     | 
| 
       2 
2 
     | 
    
         
             
            require 'open-uri'
         
     | 
| 
      
 3 
     | 
    
         
            +
            require 'json'
         
     | 
| 
       3 
4 
     | 
    
         | 
| 
       4 
5 
     | 
    
         
             
            class WLSearchScraper
         
     | 
| 
       5 
6 
     | 
    
         
             
              def initialize(searchterms)
         
     | 
| 
         @@ -7,7 +8,7 @@ class WLSearchScraper 
     | 
|
| 
       7 
8 
     | 
    
         
             
                @resultlist = Array.new
         
     | 
| 
       8 
9 
     | 
    
         
             
              end
         
     | 
| 
       9 
10 
     | 
    
         | 
| 
       10 
     | 
    
         
            -
              # Returns array of document  
     | 
| 
      
 11 
     | 
    
         
            +
              # Returns array of document URLs matching search terms
         
     | 
| 
       11 
12 
     | 
    
         
             
              def scrape
         
     | 
| 
       12 
13 
     | 
    
         
             
                @searchterms.gsub!(" ", "+")
         
     | 
| 
       13 
14 
     | 
    
         
             
                url = "https://search.wikileaks.org/advanced?q=" + @searchterms + "&exclude_words=&words_title_only=&words_content_only=&publication_type[]=3"
         
     | 
| 
         @@ -15,11 +16,72 @@ class WLSearchScraper 
     | 
|
| 
       15 
16 
     | 
    
         | 
| 
       16 
17 
     | 
    
         
             
                html.css("h4").each do |h|
         
     | 
| 
       17 
18 
     | 
    
         
             
                  href = h.css("a")[0]["href"]
         
     | 
| 
       18 
     | 
    
         
            -
                   
     | 
| 
       19 
     | 
    
         
            -
                  cable = split[split.length-1].split("_a.html")
         
     | 
| 
       20 
     | 
    
         
            -
                  @resultlist.push(cable[0])
         
     | 
| 
      
 19 
     | 
    
         
            +
                  @resultlist.push(cableParser(href))
         
     | 
| 
       21 
20 
     | 
    
         
             
                end
         
     | 
| 
       22 
21 
     | 
    
         | 
| 
       23 
     | 
    
         
            -
                return @resultlist
         
     | 
| 
      
 22 
     | 
    
         
            +
                return JSON.pretty_generate(@resultlist)
         
     | 
| 
      
 23 
     | 
    
         
            +
              end
         
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
              def cableParser(url)
         
     | 
| 
      
 26 
     | 
    
         
            +
                cablehash = Hash.new
         
     | 
| 
      
 27 
     | 
    
         
            +
                html = Nokogiri::HTML(open(url))
         
     | 
| 
      
 28 
     | 
    
         
            +
                
         
     | 
| 
      
 29 
     | 
    
         
            +
                # Go through and get all the metadata and content
         
     | 
| 
      
 30 
     | 
    
         
            +
                html.css("td").each do |t|
         
     | 
| 
      
 31 
     | 
    
         
            +
                  a = t.css("a")
         
     | 
| 
      
 32 
     | 
    
         
            +
                  if !(a.empty?) && (a[0]["title"] == "Date")
         
     | 
| 
      
 33 
     | 
    
         
            +
                    cablehash[:date] = t.css("div[2]").text
         
     | 
| 
      
 34 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "Canonical ID")
         
     | 
| 
      
 35 
     | 
    
         
            +
                    cablehash[:id] = t.css("div[2]").text
         
     | 
| 
      
 36 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "Original Classification")
         
     | 
| 
      
 37 
     | 
    
         
            +
                    cablehash[:original_classification] = t.css("div[2]").text
         
     | 
| 
      
 38 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "Current Classification")
         
     | 
| 
      
 39 
     | 
    
         
            +
                    cablehash[:current_classification] = t.css("div[2]").text
         
     | 
| 
      
 40 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "Handling Restrictions")
         
     | 
| 
      
 41 
     | 
    
         
            +
                    cablehash[:handling_restrictions] = t.css("div[2]").text
         
     | 
| 
      
 42 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "Character Count")
         
     | 
| 
      
 43 
     | 
    
         
            +
                    cablehash[:character_count] = t.css("div[2]").text
         
     | 
| 
      
 44 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "Executive Order")
         
     | 
| 
      
 45 
     | 
    
         
            +
                    cablehash[:executive_order] = t.css("div[2]").text
         
     | 
| 
      
 46 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "Locator")
         
     | 
| 
      
 47 
     | 
    
         
            +
                    cablehash[:locator] = t.css("div[2]").text
         
     | 
| 
      
 48 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "TAGS")
         
     | 
| 
      
 49 
     | 
    
         
            +
                    cablehash[:tags] = t.css("div[2]").text
         
     | 
| 
      
 50 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "Concepts")
         
     | 
| 
      
 51 
     | 
    
         
            +
                    cablehash[:concepts] = t.css("div[2]").text
         
     | 
| 
      
 52 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "Enclosure")
         
     | 
| 
      
 53 
     | 
    
         
            +
                    cablehash[:enclosure] = t.css("div[2]").text
         
     | 
| 
      
 54 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "Type")
         
     | 
| 
      
 55 
     | 
    
         
            +
                    cablehash[:type] = t.css("div[2]").text
         
     | 
| 
      
 56 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "Office Origin")
         
     | 
| 
      
 57 
     | 
    
         
            +
                    cablehash[:office_origin] = t.css("div[2]").text
         
     | 
| 
      
 58 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "Office Action")
         
     | 
| 
      
 59 
     | 
    
         
            +
                    cablehash[:office_action] = t.css("div[2]").text
         
     | 
| 
      
 60 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "Archive Status")
         
     | 
| 
      
 61 
     | 
    
         
            +
                    cablehash[:archive_status] = t.css("div[2]").text
         
     | 
| 
      
 62 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "From")
         
     | 
| 
      
 63 
     | 
    
         
            +
                    cablehash[:from] = t.css("div[2]").text
         
     | 
| 
      
 64 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "Markings")
         
     | 
| 
      
 65 
     | 
    
         
            +
                    cablehash[:markings] = t.css("div[2]").text
         
     | 
| 
      
 66 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "To")
         
     | 
| 
      
 67 
     | 
    
         
            +
                    cablehash[:to] = t.css("div[2]").text
         
     | 
| 
      
 68 
     | 
    
         
            +
                  elsif !(a.empty?) && (a[0]["title"] == "Linked documents or other documents with the same ID")
         
     | 
| 
      
 69 
     | 
    
         
            +
                    cablehash[:linked_docs] = t.css("div[2]").text
         
     | 
| 
      
 70 
     | 
    
         
            +
                  end
         
     | 
| 
      
 71 
     | 
    
         
            +
                end
         
     | 
| 
      
 72 
     | 
    
         
            +
             
     | 
| 
      
 73 
     | 
    
         
            +
                # Get cable content
         
     | 
| 
      
 74 
     | 
    
         
            +
                contentcount = 0
         
     | 
| 
      
 75 
     | 
    
         
            +
                html.css("div").each do |d|
         
     | 
| 
      
 76 
     | 
    
         
            +
                 if d["class"] == "text-content"
         
     | 
| 
      
 77 
     | 
    
         
            +
                   contentcount += 1
         
     | 
| 
      
 78 
     | 
    
         
            +
                   if contentcount == 2
         
     | 
| 
      
 79 
     | 
    
         
            +
                     cablehash[:content] = d.text
         
     | 
| 
      
 80 
     | 
    
         
            +
                   end
         
     | 
| 
      
 81 
     | 
    
         
            +
                 end
         
     | 
| 
      
 82 
     | 
    
         
            +
                end
         
     | 
| 
      
 83 
     | 
    
         
            +
                
         
     | 
| 
      
 84 
     | 
    
         
            +
                return cablehash
         
     | 
| 
       24 
85 
     | 
    
         
             
              end
         
     | 
| 
       25 
86 
     | 
    
         
             
            end
         
     | 
| 
      
 87 
     | 
    
         
            +
             
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: wlsearchscraper
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.0.1
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.0.2
         
     | 
| 
       5 
5 
     | 
    
         
             
              prerelease: 
         
     | 
| 
       6 
6 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       7 
7 
     | 
    
         
             
            authors:
         
     | 
| 
         @@ -9,7 +9,7 @@ authors: 
     | 
|
| 
       9 
9 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       10 
10 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       11 
11 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       12 
     | 
    
         
            -
            date: 2014-03- 
     | 
| 
      
 12 
     | 
    
         
            +
            date: 2014-03-28 00:00:00.000000000 Z
         
     | 
| 
       13 
13 
     | 
    
         
             
            dependencies: []
         
     | 
| 
       14 
14 
     | 
    
         
             
            description: Gets a list of documents from the WikiLeaks search that match certain
         
     | 
| 
       15 
15 
     | 
    
         
             
              terms.
         
     |