acluscraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2)
  1. data/lib/acluscraper.rb +60 -0
  2. metadata +47 -0
@@ -0,0 +1,60 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'json'
4
+ require 'uploadconvert'
5
+
6
# Scrapes the table of filings on an ACLU case page, downloads every linked
# document with wget and collects the metadata and text that UploadConvert
# reports for it into a JSON document.
class ACLUScraper
  # Site root that relative document links are resolved against.
  BASE_URL = 'https://www.aclu.org'.freeze

  # @param url [String] address of the case page whose tables list the documents
  def initialize(url)
    @url = url
    @casearray = []
  end

  # Get all the case documents
  #
  # Walks every table row that contains a usable link, downloads the linked
  # file into the current directory and records, in this order: :date, :title,
  # :url, :path, whatever keys UploadConvert#extractMetadataPDF returns, and
  # :text. Documents that cannot be downloaded or converted are reported on
  # stderr and left out of the result instead of aborting the whole run.
  #
  # @return [String] pretty-printed JSON array with one object per document
  def scrapeCase
    @casearray = [] # start fresh so a second call does not duplicate entries
    prevdate = ''

    fetch_page.css('tbody').each do |tbody|
      tbody.css('tr').each do |row|
        links = row.css('a')
        # Anchors without an href (e.g. named anchors) cannot be downloaded.
        link = links.find { |candidate| !candidate['href'].to_s.strip.empty? }
        next unless link

        prevdate = filing_date(row, prevdate)
        url = absolute_url(link['href'])
        dochash = {
          date: prevdate,
          title: links.text, # concatenated text of every link in the row
          url: url,
          path: local_filename(url)
        }

        unless download(url, dochash[:path])
          warn "ACLUScraper: could not download #{url}"
          next
        end

        @casearray.push(dochash) if extract_contents(dochash)
      end
    end

    JSON.pretty_generate(@casearray)
  end
  alias_method :scrape_case, :scrapeCase

  private

  # Fetches the case page and parses it with Nokogiri, closing the connection
  # once parsing is done.
  def fetch_page
    # Kernel#open no longer accepts URLs as of Ruby 3.0; URI.open is the
    # replacement where it exists, older Rubies fall back to open-uri's open.
    opener = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
    opener.call(@url) { |io| Nokogiri::HTML(io) }
  end

  # Returns the text of the row's first cell, or prevdate when that cell is
  # missing or blank (the page leaves it as a non-breaking space when a filing
  # shares the date of the row above).
  def filing_date(row, prevdate)
    cell = row.css('td').first
    text = cell ? cell.text.to_s : ''
    text.delete("\u00a0").strip.empty? ? prevdate : text
  end

  # Resolves href against BASE_URL; absolute links (http or https) are kept.
  def absolute_url(href)
    href = href.to_s.strip
    URI.join(BASE_URL, href).to_s
  rescue URI::Error
    # href is not a parseable URI (e.g. contains spaces): join it textually.
    href =~ %r{\Ahttps?://}i ? href : "#{BASE_URL}/#{href.sub(%r{\A/+}, '')}"
  end

  # Local file name for a document: the last path segment of its URL.
  def local_filename(url)
    url.split('/').last.to_s.strip
  end

  # Downloads url to filename with wget. The argument-list form of system
  # bypasses the shell, so characters in a scraped URL are never interpreted
  # as shell syntax. The output name is passed explicitly because wget may
  # otherwise choose a different one (e.g. add ".1" when the file exists).
  #
  # @return [Boolean, nil] true on success, false or nil when wget failed or
  #   could not be started
  def download(url, filename)
    system('wget', "--output-document=#{filename}", url)
  end

  # Adds the metadata and text UploadConvert extracts from the downloaded file
  # to dochash. Returns true on success; on any error it warns and returns
  # false so the caller can skip the document.
  def extract_contents(dochash)
    converter = UploadConvert.new(dochash[:path])
    converter.extractMetadataPDF.each { |key, value| dochash[key] = value }
    dochash[:text] = converter.detectPDFType
    true
  rescue StandardError => e
    warn "ACLUScraper: skipping #{dochash[:url]}: #{e.message}"
    false
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: acluscraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - M. C. McGrath
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-03-19 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Scrapes ACLU court documents then extracts the plaintext and metadata.
15
+ email: shidash@shidash.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/acluscraper.rb
21
+ homepage: https://github.com/Shidash/ACLUScraper
22
+ licenses:
23
+ - GPL
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 1.8.23
43
+ signing_key:
44
+ specification_version: 3
45
+ summary: Scrapes, extracts text, extracts metadata from ACLU court documents
46
+ test_files: []
47
+ has_rdoc: