coa-op-scraper 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,22 @@
1
# Fetch gems over TLS; a plain-HTTP source lets gem downloads be tampered
# with in transit.
source "https://rubygems.org"

# Runtime dependencies.
gem 'nokogiri'
gem 'rails'

group :development, :test do
  gem 'rspec-rails'
end

# Used only by the spec suite.
group :test do
  gem 'timecop'
  gem 'vcr' #, '~> 2.0.rc'
  gem 'fakeweb'
end

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "rdoc" #, "~> 3.12"
  gem "bundler" #, "~> 1.0.0"
  gem "jeweler" #, "~> 1.8.4"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,116 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ actionmailer (3.2.11)
5
+ actionpack (= 3.2.11)
6
+ mail (~> 2.4.4)
7
+ actionpack (3.2.11)
8
+ activemodel (= 3.2.11)
9
+ activesupport (= 3.2.11)
10
+ builder (~> 3.0.0)
11
+ erubis (~> 2.7.0)
12
+ journey (~> 1.0.4)
13
+ rack (~> 1.4.0)
14
+ rack-cache (~> 1.2)
15
+ rack-test (~> 0.6.1)
16
+ sprockets (~> 2.2.1)
17
+ activemodel (3.2.11)
18
+ activesupport (= 3.2.11)
19
+ builder (~> 3.0.0)
20
+ activerecord (3.2.11)
21
+ activemodel (= 3.2.11)
22
+ activesupport (= 3.2.11)
23
+ arel (~> 3.0.2)
24
+ tzinfo (~> 0.3.29)
25
+ activeresource (3.2.11)
26
+ activemodel (= 3.2.11)
27
+ activesupport (= 3.2.11)
28
+ activesupport (3.2.11)
29
+ i18n (~> 0.6)
30
+ multi_json (~> 1.0)
31
+ arel (3.0.2)
32
+ builder (3.0.4)
33
+ diff-lcs (1.1.3)
34
+ erubis (2.7.0)
35
+ fakeweb (1.3.0)
36
+ git (1.2.5)
37
+ hike (1.2.1)
38
+ i18n (0.6.1)
39
+ jeweler (1.8.4)
40
+ bundler (~> 1.0)
41
+ git (>= 1.2.5)
42
+ rake
43
+ rdoc
44
+ journey (1.0.4)
45
+ json (1.7.6)
46
+ mail (2.4.4)
47
+ i18n (>= 0.4.0)
48
+ mime-types (~> 1.16)
49
+ treetop (~> 1.4.8)
50
+ mime-types (1.19)
51
+ multi_json (1.5.0)
52
+ nokogiri (1.5.6)
53
+ polyglot (0.3.3)
54
+ rack (1.4.4)
55
+ rack-cache (1.2)
56
+ rack (>= 0.4)
57
+ rack-ssl (1.3.2)
58
+ rack
59
+ rack-test (0.6.2)
60
+ rack (>= 1.0)
61
+ rails (3.2.11)
62
+ actionmailer (= 3.2.11)
63
+ actionpack (= 3.2.11)
64
+ activerecord (= 3.2.11)
65
+ activeresource (= 3.2.11)
66
+ activesupport (= 3.2.11)
67
+ bundler (~> 1.0)
68
+ railties (= 3.2.11)
69
+ railties (3.2.11)
70
+ actionpack (= 3.2.11)
71
+ activesupport (= 3.2.11)
72
+ rack-ssl (~> 1.3.2)
73
+ rake (>= 0.8.7)
74
+ rdoc (~> 3.4)
75
+ thor (>= 0.14.6, < 2.0)
76
+ rake (10.0.3)
77
+ rdoc (3.12)
78
+ json (~> 1.4)
79
+ rspec-core (2.12.2)
80
+ rspec-expectations (2.12.1)
81
+ diff-lcs (~> 1.1.3)
82
+ rspec-mocks (2.12.1)
83
+ rspec-rails (2.12.2)
84
+ actionpack (>= 3.0)
85
+ activesupport (>= 3.0)
86
+ railties (>= 3.0)
87
+ rspec-core (~> 2.12.0)
88
+ rspec-expectations (~> 2.12.0)
89
+ rspec-mocks (~> 2.12.0)
90
+ sprockets (2.2.2)
91
+ hike (~> 1.2)
92
+ multi_json (~> 1.0)
93
+ rack (~> 1.0)
94
+ tilt (~> 1.1, != 1.3.0)
95
+ thor (0.16.0)
96
+ tilt (1.3.3)
97
+ timecop (0.5.9)
98
+ treetop (1.4.12)
99
+ polyglot
100
+ polyglot (>= 0.3.1)
101
+ tzinfo (0.3.35)
102
+ vcr (2.4.0)
103
+
104
+ PLATFORMS
105
+ ruby
106
+
107
+ DEPENDENCIES
108
+ bundler
109
+ fakeweb
110
+ jeweler
111
+ nokogiri
112
+ rails
113
+ rdoc
114
+ rspec-rails
115
+ timecop
116
+ vcr
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2013 Don Cruse
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,65 @@
1
+ # Texas COA Op Scraper - a gem for Texas courts of appeals
2
+
3
+ ### What's this about?
4
+
5
+ This gem understands how to parse the opinion lists released by each of
6
+ Texas's fourteen intermediate courts of appeals.
7
+
8
+ Opinion releases are announced on a separate webpage for each court of
9
+ appeals. Some courts use a legacy system; others have shifted to the new
10
+ TAMES system employed by the Texas Supreme Court.
11
+
12
+ ### Why does this gem exist?
13
+
14
+ It was developed as part of the TexApp.org project ([github](http://github.com/texapp)),
15
+ which aims to ensure that Texas's court of appeals opinions are available in
16
+ a reliable &mdash; and citable &mdash; location available to the general public,
17
+ members of the bar, and the court system itself.
18
+
19
+ In Texas, unpublished decisions of intermediate courts of appeals are precedential.
20
+ Yet litigants do not always have a reliable way to locate or cite this authority.
21
+ In the past, it was possible to use a well-crafted Google search to locate
22
+ relevant opinions (a technique discussed in [this 2009 blog post](http://www.scotxblog.com/practice-notes/researching-unpublished-coa-opinions-in-texas/)). But with the courts'
23
+ new TAMES system, these Google searches no longer work. The TAMES system does provide
24
+ many of these opinions in an online archive, but its URLs are prohibitively long and
25
+ complex to include in any printed legal brief.
26
+
27
+ ### How can I use this?
28
+
29
+ This gem can be folded into the application of your choice to store
30
+ information about these opinions or queue up downloads of the opinions
31
+ themselves. It does not contain code related to data storage or any
32
+ interface to a file storage service. Those implementation details
33
+ are up to you.
34
+
35
+ The simplest way to use this gem is to specify a particular court of appeals
36
+ (using its two-digit numerical notation, like "03" for the Third Court) and a
37
+ particular date on which you want to check for opinions. The gem will then
38
+ determine the correct URL to use, check that page, and parse what is found to
39
+ retrieve the metadata for each opinion released on that date. What you get back
40
+ is a list of that metadata.
41
+
42
+ The data for each opinion is a simple hash. The overall
43
+ set of results is just an array of those hashes, or an empty array if no
44
+ results were found for that page. Here is an example of the hash for one opinion:
45
+
46
+ > { :author_string => "Opinion by Justice Pemberton",
47
+ :opinion_urls => {"html"=>"/opinions/htmlopinion.asp?OpinionId=20764",
48
+ "pdf"=>"/opinions/PDFOpinion.asp?OpinionId=20764"},
49
+ :disposition => "AFFIRMED:",
50
+ :panel_string => "(Before Chief Justice Jones, Justices Pemberton and Henson)",
51
+ :release_date => Fri, 20 Jan 2012,
52
+ :case_style => "Janeen Denise Smith v. The State of Texas",
53
+ :origin => "Appeal from County Court at Law No. 1 of Caldwell County",
54
+ :docket_no => "03-10-00725-CR",
55
+ :docket_page_url => "/opinions/case.asp?FilingID=15750" }
56
+
57
+ It's up to you to write code that does something interesting with that metadata &mdash;
58
+ such as storing it or downloading the opinion PDFs themselves (as is being done
59
+ for TexApp.org).
60
+
61
+ ## Copyright
62
+
63
+ Copyright (c) 2013 Don Cruse. See LICENSE.txt for
64
+ further details.
65
+
data/Rakefile ADDED
@@ -0,0 +1,31 @@
1
# encoding: utf-8

# Build script: wires up the Jeweler packaging tasks and the RSpec task,
# and makes running the specs the default rake task.

require 'rubygems'
require 'bundler'
begin
  # Put the :default and :development gem groups on the load path;
  # bail out with Bundler's own exit status if any are missing.
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "coa-op-scraper"
  gem.homepage = "http://github.com/doncruse/coa-op-scraper"
  # The license field takes a license identifier, not a copyright notice.
  # LICENSE.txt carries the MIT license text (and the copyright line).
  gem.license = "MIT"
  gem.summary = "A scraper for intermediate Texas appellate opinions"
  gem.description = "A scraper for intermediate appellate opinions"
  gem.email = "doncruse@gmail.com"
  gem.authors = ["Don Cruse"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new
task :default => :spec
31
+
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.2
@@ -0,0 +1,77 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "coa-op-scraper"
8
+ s.version = "0.2.2"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Don Cruse"]
12
+ s.date = "2013-02-09"
13
+ s.description = "A scraper for intermediate appellate opinions"
14
+ s.email = "doncruse@gmail.com"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.md"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".rspec",
22
+ "Gemfile",
23
+ "Gemfile.lock",
24
+ "LICENSE.txt",
25
+ "README.md",
26
+ "Rakefile",
27
+ "VERSION",
28
+ "coa-op-scraper.gemspec",
29
+ "lib/coa_docket_no.rb",
30
+ "lib/coa_op_scraper.rb",
31
+ "lib/legacy.rb",
32
+ "lib/tames.rb",
33
+ "spec/scrapers/legacy_scraper_spec.rb",
34
+ "spec/scrapers/main_spec.rb",
35
+ "spec/scrapers/tames_scraper_spec.rb",
36
+ "spec/spec_helper.rb",
37
+ "spec/support/vcr.rb",
38
+ "spec/vcr/legacy/02-2003-02-20.json",
39
+ "spec/vcr/legacy/03-2013-01-10.json",
40
+ "spec/vcr/tames/01-2012-01-19.json",
41
+ "spec/vcr/tames/12-2003-01-31.json",
42
+ "spec/vcr/tames/14-2005-01-20.json"
43
+ ]
44
+ s.homepage = "http://github.com/doncruse/coa-op-scraper"
45
+ s.licenses = ["(c)2013 Don Cruse"]
46
+ s.require_paths = ["lib"]
47
+ s.rubygems_version = "1.8.23"
48
+ s.summary = "A scraper for intermediate Texas appellate opinions"
49
+
50
+ if s.respond_to? :specification_version then
51
+ s.specification_version = 3
52
+
53
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
54
+ s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
55
+ s.add_runtime_dependency(%q<rails>, [">= 0"])
56
+ s.add_development_dependency(%q<rspec-rails>, [">= 0"])
57
+ s.add_development_dependency(%q<rdoc>, [">= 0"])
58
+ s.add_development_dependency(%q<bundler>, [">= 0"])
59
+ s.add_development_dependency(%q<jeweler>, [">= 0"])
60
+ else
61
+ s.add_dependency(%q<nokogiri>, [">= 0"])
62
+ s.add_dependency(%q<rails>, [">= 0"])
63
+ s.add_dependency(%q<rspec-rails>, [">= 0"])
64
+ s.add_dependency(%q<rdoc>, [">= 0"])
65
+ s.add_dependency(%q<bundler>, [">= 0"])
66
+ s.add_dependency(%q<jeweler>, [">= 0"])
67
+ end
68
+ else
69
+ s.add_dependency(%q<nokogiri>, [">= 0"])
70
+ s.add_dependency(%q<rails>, [">= 0"])
71
+ s.add_dependency(%q<rspec-rails>, [">= 0"])
72
+ s.add_dependency(%q<rdoc>, [">= 0"])
73
+ s.add_dependency(%q<bundler>, [">= 0"])
74
+ s.add_dependency(%q<jeweler>, [">= 0"])
75
+ end
76
+ end
77
+
@@ -0,0 +1,81 @@
1
module CoaOpScraper
  # Encapsulates the logic of working with COA docket numbers of the form
  # "CC-YY-NNNNN-TT": court number, year, case number and a "CR"/"CV" suffix.
  #
  # Note: this only checks the *shape* of the number. It is distinct from
  # knowing whether a valid docket number was actually used.
  class CoaDocketNo
    # The raw docket number as given, or nil when it did not have the
    # expected shape.
    attr_accessor :no

    # @param no [String, nil] a dash-separated docket number. It is kept only
    #   when it has exactly four parts and ends in "CR" or "CV"; anything
    #   else (including nil) leaves the object invalid.
    def initialize(no)
      text = no.to_s # tolerate nil instead of raising NoMethodError
      parts = text.split("-")
      well_formed = parts.count == 4 && (parts.last == "CR" || parts.last == "CV")
      @no = well_formed ? text : nil
    end

    # @return [Boolean] true when a well-formed docket number is held.
    def valid?
      !@no.nil?
    end

    # @return [String] same as #fixed_length.
    def to_s
      fixed_length
    end

    # Zero-pads each numeric part to its standard width (2-2-5).
    #
    # @return [String] e.g. "03-10-00725-CR", or "" when invalid.
    # @raise [ArgumentError] if a numeric part is not a decimal integer.
    def fixed_length
      return "" unless valid?

      coa, year, number, type_suffix = @no.split("-")
      [padded(coa, 2), padded(year, 2), padded(number, 5), type_suffix].join('-')
    end

    # @return [String] the fixed-length number with the "-CR"/"-CV" suffix
    #   removed.
    def without_type
      fixed_length.sub("-CR", "").sub("-CV", "")
    end

    #####################################
    # For accessing pieces
    # (each returns nil / false when the docket number is invalid, because
    # the canonical form is then the empty string)

    # @return [String, nil] zero-padded court number, e.g. "03".
    def coa_number
      canonical.split("-")[0]
    end

    # @return [String, nil] zero-padded two-digit year.
    def year_number
      canonical.split("-")[1]
    end

    # @return [String, nil] zero-padded five-digit case number.
    def case_number
      canonical.split("-")[2]
    end

    # @return [Boolean] true when the suffix is "CV".
    def civil?
      canonical.split("-")[3] == "CV"
    end

    # @return [Boolean] true when the suffix is "CR".
    def criminal?
      canonical.split("-")[3] == "CR"
    end

    ###################################
    # Standardizing how used internally

    # The -CV/-CR suffix is dropped because it is not relevant to uniqueness.
    def for_database_key
      without_type
    end

    def for_web_urls
      fixed_length
    end

    def canonical
      fixed_length
    end

    protected

    # Left-pads +number+ with zeros to +digits+ characters.
    #
    # The value is parsed explicitly as base 10. Handing a String straight
    # to sprintf("%d") parses it with radix prefixes, so a leading zero
    # means octal: "08"/"09" raise ArgumentError and "00725" becomes 469.
    def padded(number, digits)
      format("%0#{digits}d", Integer(number.to_s, 10))
    end
  end
end
@@ -0,0 +1,92 @@
1
+ # encoding=utf-8
2
module CoaOpScraper
  require 'legacy'
  require 'tames'
  require 'coa_docket_no'

  require 'date'
  require 'open-uri'
  require 'active_support/core_ext'

  # The Texas appellate websites are sometimes fragile.
  # These sleep intervals should give ample time between requests.
  # (Neither constant is read in this file; presumably they are meant for
  # callers that pace their own requests.)
  HISTORICAL_THROTTLE = 10
  CURRENT_THROTTLE = 5

  # When false, urls_for_historical_range skips Saturdays and Sundays.
  @@check_weekends = false

  # A court's placement in one of these two arrays tells you about the
  # webpage format currently used by that court.
  TAMES_COAS = [ "01", "03", "04", "05", "06", "09", "11", "12", "14" ]
  LEGACY_COAS = [ "02", "07", "08", "10", "13" ]

  ############################################################
  # This is the easiest method to use here. Feed it a COA value
  # (in the form "03", for example) and the date for which you
  # want the results (in the form of a Ruby date object).
  #
  # Returns whatever the matching parser (Tames or Legacy) returns
  # for that court's opinion list page.
  def self.scrape_one_opinion_list(coa, target_date)
    doc = retrieve_list_for_coa_for_date(coa, target_date)
    parse_opinion_list_for_coa(coa, doc)
  end

  ############################################################
  # These methods would be useful to populate a queue of opinion
  # lists to check later.
  #
  # The #urls_for_historical_range method will, as expected,
  # compute a list of the URLs that are appropriate (excluding
  # weekends by default).
  #
  # The #parse_coa_opinion_list_at method will take a coa number
  # and a URL and return back a list of the results.

  # Returns an array of URLs, one per date in start_date..end_date.
  def self.urls_for_historical_range(coa, start_date, end_date)
    (start_date..end_date).each_with_object([]) do |target_date, urls|
      next unless @@check_weekends || target_date.weekday?
      urls << url_for_coa_for_date(coa, target_date)
    end
  end

  # Takes a URL, returns a list of the opinion data ([] when the parser
  # returns nil or the court is not recognised).
  #
  # The page at +url+ is fetched directly. Routing the URL through
  # scrape_one_opinion_list would hand it to the URL builders in the
  # position where they expect a date.
  def self.parse_coa_opinion_list_at(coa, url)
    parse_opinion_list_for_coa(coa, open_url(url)) || []
  end

  # NOTE: a bare `protected` here would not apply to `def self.` methods,
  # so the two methods below have always been publicly callable and are
  # left that way for compatibility.

  # Returns the opinion list URL for the court and date, as built by the
  # Tames or Legacy module, or nil when +coa+ is in neither list.
  def self.url_for_coa_for_date(coa, date)
    if TAMES_COAS.include?(coa)
      CoaOpScraper::Tames.url_for_coa_for_date(coa, date)
    elsif LEGACY_COAS.include?(coa)
      CoaOpScraper::Legacy.url_for_coa_for_date(coa, date)
    end
  end

  # Fetches the opinion list page for the court and date and returns the
  # opened IO-like object.
  #
  # Raises ArgumentError when +coa+ is not a recognised court, rather than
  # failing inside the HTTP layer on a nil URL.
  def self.retrieve_list_for_coa_for_date(coa, date)
    url = url_for_coa_for_date(coa, date)
    raise ArgumentError, "unknown court of appeals: #{coa.inspect}" if url.nil?
    open_url(url)
  end

  # Dispatches +doc+ to the parser matching the court's page format.
  # Returns nil when +coa+ is in neither list.
  def self.parse_opinion_list_for_coa(coa, doc)
    if TAMES_COAS.include?(coa)
      CoaOpScraper::Tames.parse_opinion_list(doc)
    elsif LEGACY_COAS.include?(coa)
      CoaOpScraper::Legacy.parse_opinion_list(doc)
    end
  end
  private_class_method :parse_opinion_list_for_coa

  # Opens +url+ through open-uri. Going through the parsed URI object works
  # on old and new Rubies alike (bare Kernel#open stopped handling URLs in
  # Ruby 3.0) and never treats the string as a local path or a pipe.
  def self.open_url(url)
    URI.parse(url).open
  end
  private_class_method :open_url
end
76
+
77
# This is required (and helpful) to parse Texas court docket pages
class String
  # Returns a copy with every leading and trailing run of whitespace and
  # non-breaking spaces (U+00A0) removed. Gets rid of some pesky unicode
  # found on Texas OCA sites, which String#strip leaves in place.
  #
  # Anchored with \A and \z so only the ends of the whole string are
  # touched; non-breaking spaces inside the text, including at interior
  # line breaks, are preserved.
  def nbsp_strip
    gsub(/\A[\s\u00a0]+|[\s\u00a0]+\z/, "")
  end

  # Kept for callers that use this name. nbsp_strip already clears both
  # ends completely, so no repeated or reversed passes are needed.
  def strip_both_ends
    nbsp_strip
  end
end
87
+
88
class Date
  # @return [Boolean] true for Monday through Friday, false for Saturday
  #   and Sunday.
  def weekday?
    !(saturday? || sunday?)
  end
end