acluscraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. data/lib/acluscraper.rb +60 -0
  2. metadata +47 -0
@@ -0,0 +1,60 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'json'
4
+ require 'uploadconvert'
5
+
6
# Scrapes the document table of an ACLU case page: every table row that
# contains a link is treated as one court document, which is downloaded with
# wget into the current directory and then run through UploadConvert to pull
# out its metadata and text.
class ACLUScraper
  # Host prepended to links that are not already absolute.
  BASE_URL = 'https://www.aclu.org'

  # Text of a date cell that holds only a non-breaking space; such a row
  # reuses the date of the nearest dated row above it.
  BLANK_CELL = "\u00a0"

  # url - address of the case page to scrape.
  def initialize(url)
    @url = url
    @casearray = []
  end

  # Get all the case documents
  #
  # Rows without a link are ignored. Rows whose link has no href, whose
  # download fails, or whose extraction raises are skipped (with a warning for
  # the latter two) so that one bad document does not abort the whole run.
  #
  # Returns a pretty-printed JSON array (String) with one object per document
  # holding :date, :title, :url, :path, the extracted metadata pairs and :text.
  def scrapeCase
    prevdate = ''

    fetch_page.css('tbody').each do |table|
      table.css('tr').each do |row|
        links = row.css('a')
        next if links.empty?

        # Get date for filing; a missing or blank cell keeps the previous date.
        cell = row.css('td')[0]
        prevdate = cell.text.to_s unless cell.nil? || cell.text == BLANK_CELL

        # Get URL; an anchor without a target cannot be downloaded.
        href = links[0]['href'].to_s.strip
        next if href.empty?

        dochash = { date: prevdate, title: links.text, url: absolute_url(href) }

        # Download documents
        next unless download(dochash[:url])

        # wget names the file after the last path segment of the URL.
        dochash[:path] = dochash[:url].split('/').last.to_s.strip

        extract_into(dochash)
      end
    end

    JSON.pretty_generate(@casearray)
  end

  private

  # Fetches @url and returns the parsed HTML document. The block form closes
  # the connection once parsing is done.
  def fetch_page
    parse = lambda { |io| Nokogiri::HTML(io) }
    # URI.open is the public entry point on current Rubies (Kernel#open no
    # longer fetches URLs since Ruby 3.0); older Rubies only offer the
    # Kernel#open hook installed by open-uri.
    URI.respond_to?(:open) ? URI.open(@url, &parse) : open(@url, &parse)
  end

  # Returns href unchanged when it already carries an http(s) scheme,
  # otherwise resolves it against BASE_URL.
  def absolute_url(href)
    return href if href =~ %r{\Ahttps?://}i

    href.start_with?('/') ? BASE_URL + href : "#{BASE_URL}/#{href}"
  end

  # Downloads url with wget and returns true on success. The URL is passed as
  # a separate argument, so no shell ever interprets text scraped from the page.
  def download(url)
    return true if system('wget', url)

    warn "ACLUScraper: download failed for #{url}"
    false
  end

  # Extract metadata and text for the downloaded file named in dochash[:path]
  # and, on success, append the completed hash to the result list.
  def extract_into(dochash)
    converter = UploadConvert.new(dochash[:path])
    converter.extractMetadataPDF.each { |key, value| dochash[key] = value }
    dochash[:text] = converter.detectPDFType
    @casearray.push(dochash)
  rescue StandardError => e
    # Best effort: report the document that failed and carry on with the rest.
    warn "ACLUScraper: skipping #{dochash[:url]}: #{e.class}: #{e.message}"
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: acluscraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - M. C. McGrath
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-03-19 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Scrapes ACLU court documents then extracts the plaintext and metadata.
15
+ email: shidash@shidash.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/acluscraper.rb
21
+ homepage: https://github.com/Shidash/ACLUScraper
22
+ licenses:
23
+ - GPL
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 1.8.23
43
+ signing_key:
44
+ specification_version: 3
45
+ summary: Scrapes, extracts text, extracts metadata from ACLU court documents
46
+ test_files: []
47
+ has_rdoc: