coa-op-scraper 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,22 @@
1
# Bundler manifest for developing and testing the coa-op-scraper gem.
#
# Gems are fetched over HTTPS so that package downloads cannot be read or
# tampered with in transit (the plain-HTTP source offers no such protection).
source "https://rubygems.org"

# Runtime dependencies (jeweler copies these into the generated gemspec).
gem 'nokogiri'
gem 'rails'

group :development, :test do
  gem 'rspec-rails'
end

# Only needed while running the spec suite.
group :test do
  gem 'timecop'
  gem 'vcr' #, '~> 2.0.rc'
  gem 'fakeweb'
end

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "rdoc" #, "~> 3.12"
  gem "bundler" #, "~> 1.0.0"
  gem "jeweler" #, "~> 1.8.4"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,116 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ actionmailer (3.2.11)
5
+ actionpack (= 3.2.11)
6
+ mail (~> 2.4.4)
7
+ actionpack (3.2.11)
8
+ activemodel (= 3.2.11)
9
+ activesupport (= 3.2.11)
10
+ builder (~> 3.0.0)
11
+ erubis (~> 2.7.0)
12
+ journey (~> 1.0.4)
13
+ rack (~> 1.4.0)
14
+ rack-cache (~> 1.2)
15
+ rack-test (~> 0.6.1)
16
+ sprockets (~> 2.2.1)
17
+ activemodel (3.2.11)
18
+ activesupport (= 3.2.11)
19
+ builder (~> 3.0.0)
20
+ activerecord (3.2.11)
21
+ activemodel (= 3.2.11)
22
+ activesupport (= 3.2.11)
23
+ arel (~> 3.0.2)
24
+ tzinfo (~> 0.3.29)
25
+ activeresource (3.2.11)
26
+ activemodel (= 3.2.11)
27
+ activesupport (= 3.2.11)
28
+ activesupport (3.2.11)
29
+ i18n (~> 0.6)
30
+ multi_json (~> 1.0)
31
+ arel (3.0.2)
32
+ builder (3.0.4)
33
+ diff-lcs (1.1.3)
34
+ erubis (2.7.0)
35
+ fakeweb (1.3.0)
36
+ git (1.2.5)
37
+ hike (1.2.1)
38
+ i18n (0.6.1)
39
+ jeweler (1.8.4)
40
+ bundler (~> 1.0)
41
+ git (>= 1.2.5)
42
+ rake
43
+ rdoc
44
+ journey (1.0.4)
45
+ json (1.7.6)
46
+ mail (2.4.4)
47
+ i18n (>= 0.4.0)
48
+ mime-types (~> 1.16)
49
+ treetop (~> 1.4.8)
50
+ mime-types (1.19)
51
+ multi_json (1.5.0)
52
+ nokogiri (1.5.6)
53
+ polyglot (0.3.3)
54
+ rack (1.4.4)
55
+ rack-cache (1.2)
56
+ rack (>= 0.4)
57
+ rack-ssl (1.3.2)
58
+ rack
59
+ rack-test (0.6.2)
60
+ rack (>= 1.0)
61
+ rails (3.2.11)
62
+ actionmailer (= 3.2.11)
63
+ actionpack (= 3.2.11)
64
+ activerecord (= 3.2.11)
65
+ activeresource (= 3.2.11)
66
+ activesupport (= 3.2.11)
67
+ bundler (~> 1.0)
68
+ railties (= 3.2.11)
69
+ railties (3.2.11)
70
+ actionpack (= 3.2.11)
71
+ activesupport (= 3.2.11)
72
+ rack-ssl (~> 1.3.2)
73
+ rake (>= 0.8.7)
74
+ rdoc (~> 3.4)
75
+ thor (>= 0.14.6, < 2.0)
76
+ rake (10.0.3)
77
+ rdoc (3.12)
78
+ json (~> 1.4)
79
+ rspec-core (2.12.2)
80
+ rspec-expectations (2.12.1)
81
+ diff-lcs (~> 1.1.3)
82
+ rspec-mocks (2.12.1)
83
+ rspec-rails (2.12.2)
84
+ actionpack (>= 3.0)
85
+ activesupport (>= 3.0)
86
+ railties (>= 3.0)
87
+ rspec-core (~> 2.12.0)
88
+ rspec-expectations (~> 2.12.0)
89
+ rspec-mocks (~> 2.12.0)
90
+ sprockets (2.2.2)
91
+ hike (~> 1.2)
92
+ multi_json (~> 1.0)
93
+ rack (~> 1.0)
94
+ tilt (~> 1.1, != 1.3.0)
95
+ thor (0.16.0)
96
+ tilt (1.3.3)
97
+ timecop (0.5.9)
98
+ treetop (1.4.12)
99
+ polyglot
100
+ polyglot (>= 0.3.1)
101
+ tzinfo (0.3.35)
102
+ vcr (2.4.0)
103
+
104
+ PLATFORMS
105
+ ruby
106
+
107
+ DEPENDENCIES
108
+ bundler
109
+ fakeweb
110
+ jeweler
111
+ nokogiri
112
+ rails
113
+ rdoc
114
+ rspec-rails
115
+ timecop
116
+ vcr
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2013 Don Cruse
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,65 @@
1
+ # Texas COA Op Scraper - a gem for Texas courts of appeals
2
+
3
+ ### What's this about?
4
+
5
+ This gem understands how to parse the opinion lists released by each of
6
+ Texas's fourteen intermediate courts of appeals.
7
+
8
+ Opinion releases are announced on a separate webpage for each court of
9
+ appeals. Some courts use a legacy system; others have shifted to the new
10
+ TAMES system employed by the Texas Supreme Court.
11
+
12
+ ### Why does this gem exist?
13
+
14
+ It was developed as part of the TexApp.org project ([github](http://github.com/texapp)),
15
+ which aims to ensure that Texas's court of appeals opinions are available in
16
+ a reliable &mdash; and citable &mdash; location available to the general public,
17
+ members of the bar, and the court system itself.
18
+
19
+ In Texas, unpublished decisions of intermediate courts of appeals are precedential.
20
+ Yet litigants do not always have a reliable way to locate or cite this authority.
21
+ In the past, it was possible to use a well-crafted Google search to locate
22
+ relevant opinions (a technique discussed in [this 2009 blog post](http://www.scotxblog.com/practice-notes/researching-unpublished-coa-opinions-in-texas/)). But with the courts'
23
+ new TAMES system, these Google searches no longer work. The TAMES system does provide
24
+ many of these opinions in an online archive, but its URLs are prohibitively long and
25
+ complex to include in any printed legal brief.
26
+
27
+ ### How can I use this?
28
+
29
+ This gem can be folded into the application of your choice to store
30
+ information about these opinions or queue up downloads of the opinions
31
+ themselves. It does not contain code related to data storage or any
32
+ interface to a file storage service. Those implementation details
33
+ are up to you.
34
+
35
+ The simplest way to use this gem is to specify a particular court of appeals
36
+ (using its two-digit numerical notation, like "03" for the Third Court) and a
37
+ particular date on which you want to check for opinions. The gem will then
38
+ determine the correct URL to use, check that page, and parse what is found to
39
+ retrieve the metadata for each opinion released on that date. What you get back
40
+ is a list of that metadata.
41
+
42
+ The data for each opinion is a simple hash. The overall
43
+ set of results is just an array of those hashes, or an empty array if no
44
+ results were found for that page. Here is an example of the hash for one opinion:
45
+
46
+ > { :author_string => "Opinion by Justice Pemberton",
47
+ :opinion_urls => {"html"=>"/opinions/htmlopinion.asp?OpinionId=20764",
48
+ "pdf"=>"/opinions/PDFOpinion.asp?OpinionId=20764"},
49
+ :disposition => "AFFIRMED:",
50
+ :panel_string => "(Before Chief Justice Jones, Justices Pemberton and Henson)",
51
+ :release_date => Fri, 20 Jan 2012,
52
+ :case_style => "Janeen Denise Smith v. The State of Texas",
53
+ :origin => "Appeal from County Court at Law No. 1 of Caldwell County",
54
+ :docket_no => "03-10-00725-CR",
55
+ :docket_page_url => "/opinions/case.asp?FilingID=15750" }
56
+
57
+ It's up to you to write code that does something interesting with that metadata &mdash;
58
+ such as storing it or downloading the opinion PDFs themselves (as is being done
59
+ for TexApp.org).
60
+
61
+ ## Copyright
62
+
63
+ Copyright (c) 2013 Don Cruse. See LICENSE.txt for
64
+ further details.
65
+
data/Rakefile ADDED
@@ -0,0 +1,31 @@
1
+ # encoding: utf-8
2
+
3
require 'rubygems'
require 'bundler'

# Activate the gems from the Gemfile's default and development groups before
# anything else is loaded; bail out with Bundler's own exit status if any of
# them is missing.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

# Jeweler generates coa-op-scraper.gemspec from the settings below
# (run `rake gemspec` after changing them).
require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "coa-op-scraper"
  gem.homepage = "http://github.com/doncruse/coa-op-scraper"
  # The license field names the license itself; the copyright notice lives in
  # LICENSE.txt, whose text is the MIT license.
  gem.license = "MIT"
  gem.summary = "A scraper for intermediate Texas appellate opinions"
  gem.description = "A scraper for intermediate appellate opinions"
  gem.email = "doncruse@gmail.com"
  gem.authors = ["Don Cruse"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

# `rake` with no arguments runs the RSpec suite.
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new
task :default => :spec
31
+
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.2
@@ -0,0 +1,77 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s| # Generated gem specification; change Jeweler::Tasks in the Rakefile rather than this block.
7
+ s.name = "coa-op-scraper"
8
+ s.version = "0.2.2"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Don Cruse"]
12
+ s.date = "2013-02-09"
13
+ s.description = "A scraper for intermediate appellate opinions"
14
+ s.email = "doncruse@gmail.com"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.md"
18
+ ]
19
+ s.files = [ # Every file packaged into the gem: project metadata, lib sources, specs and the spec/vcr JSON fixtures.
20
+ ".document",
21
+ ".rspec",
22
+ "Gemfile",
23
+ "Gemfile.lock",
24
+ "LICENSE.txt",
25
+ "README.md",
26
+ "Rakefile",
27
+ "VERSION",
28
+ "coa-op-scraper.gemspec",
29
+ "lib/coa_docket_no.rb",
30
+ "lib/coa_op_scraper.rb",
31
+ "lib/legacy.rb",
32
+ "lib/tames.rb",
33
+ "spec/scrapers/legacy_scraper_spec.rb",
34
+ "spec/scrapers/main_spec.rb",
35
+ "spec/scrapers/tames_scraper_spec.rb",
36
+ "spec/spec_helper.rb",
37
+ "spec/support/vcr.rb",
38
+ "spec/vcr/legacy/02-2003-02-20.json",
39
+ "spec/vcr/legacy/03-2013-01-10.json",
40
+ "spec/vcr/tames/01-2012-01-19.json",
41
+ "spec/vcr/tames/12-2003-01-31.json",
42
+ "spec/vcr/tames/14-2005-01-20.json"
43
+ ]
44
+ s.homepage = "http://github.com/doncruse/coa-op-scraper"
45
+ s.licenses = ["(c)2013 Don Cruse"] # NOTE(review): reads as a copyright notice rather than a license name; the terms themselves are in LICENSE.txt.
46
+ s.require_paths = ["lib"]
47
+ s.rubygems_version = "1.8.23"
48
+ s.summary = "A scraper for intermediate Texas appellate opinions"
49
+
50
+ if s.respond_to? :specification_version then # The same dependency list is declared three ways, depending on what the running RubyGems supports.
51
+ s.specification_version = 3
52
+
53
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then # RubyGems >= 1.2.0: runtime and development dependencies are declared separately.
54
+ s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
55
+ s.add_runtime_dependency(%q<rails>, [">= 0"])
56
+ s.add_development_dependency(%q<rspec-rails>, [">= 0"])
57
+ s.add_development_dependency(%q<rdoc>, [">= 0"])
58
+ s.add_development_dependency(%q<bundler>, [">= 0"])
59
+ s.add_development_dependency(%q<jeweler>, [">= 0"])
60
+ else # RubyGems older than 1.2.0: every dependency goes through plain add_dependency.
61
+ s.add_dependency(%q<nokogiri>, [">= 0"])
62
+ s.add_dependency(%q<rails>, [">= 0"])
63
+ s.add_dependency(%q<rspec-rails>, [">= 0"])
64
+ s.add_dependency(%q<rdoc>, [">= 0"])
65
+ s.add_dependency(%q<bundler>, [">= 0"])
66
+ s.add_dependency(%q<jeweler>, [">= 0"])
67
+ end
68
+ else # Specification object without specification_version: same plain add_dependency list.
69
+ s.add_dependency(%q<nokogiri>, [">= 0"])
70
+ s.add_dependency(%q<rails>, [">= 0"])
71
+ s.add_dependency(%q<rspec-rails>, [">= 0"])
72
+ s.add_dependency(%q<rdoc>, [">= 0"])
73
+ s.add_dependency(%q<bundler>, [">= 0"])
74
+ s.add_dependency(%q<jeweler>, [">= 0"])
75
+ end
76
+ end
77
+
@@ -0,0 +1,81 @@
1
module CoaOpScraper
  # Encapsulates the logic of working with COA docket numbers such as
  # "03-10-00725-CR": four dash-separated segments (court, year, case number,
  # and a "CR" or "CV" type suffix).
  #
  # Note: this only checks the *shape* of the number. It is distinct from
  # knowing whether a valid docket number was actually used by a court.
  class CoaDocketNo
    # The docket number as given to the constructor, or nil when it was not
    # well formed.
    attr_accessor :no

    # Type suffixes accepted as the fourth segment.
    TYPE_SUFFIXES = ["CR", "CV"]

    # @param no [String, nil] docket number text, e.g. "3-10-725-CV";
    #   nil or malformed input produces an invalid (empty) docket number
    #   instead of raising.
    def initialize(no)
      text = no.to_s
      @no = well_formed?(text) ? text : nil
    end

    # @return [Boolean] true when the constructor accepted the input.
    def valid?
      !@no.nil?
    end

    # @return [String] same as #fixed_length.
    def to_s
      fixed_length
    end

    # Zero-pads every numeric segment to its standard width (2-2-5).
    #
    # @return [String] e.g. "03-10-00725-CR", or "" when invalid.
    def fixed_length
      return "" unless valid?

      coa, year, number, type_suffix = @no.split("-")
      [padded(coa, 2), padded(year, 2), padded(number, 5), type_suffix].join("-")
    end

    # @return [String] the fixed-length form with the -CR/-CV suffix removed.
    def without_type
      fixed_length.sub("-CR", "").sub("-CV", "")
    end

    #####################################
    # For accessing pieces
    # (each returns nil, or false for the predicates, when invalid)

    def coa_number
      canonical.split("-")[0]
    end

    def year_number
      canonical.split("-")[1]
    end

    def case_number
      canonical.split("-")[2]
    end

    def civil?
      canonical.split("-")[3] == "CV"
    end

    def criminal?
      canonical.split("-")[3] == "CR"
    end

    ###################################
    # Standardizing how used internally

    # The -CV/-CR suffix is not relevant to uniqueness, so it is left out.
    def for_database_key
      without_type
    end

    def for_web_urls
      fixed_length
    end

    def canonical
      fixed_length
    end

    protected

    # Zero-pads +number+ to +digits+ places.
    #
    # The segment is parsed explicitly as base 10. Handing the raw string to
    # sprintf("%d") would let a leading zero select octal, turning "00725"
    # into 469 and raising ArgumentError for "08" or "09".
    def padded(number, digits)
      format("%0#{digits}d", Integer(number.to_s.strip, 10))
    end

    # True when +text+ has four dash-separated segments, the first three made
    # of digits only and the last one a known type suffix.
    def well_formed?(text)
      parts = text.split("-")
      return false unless parts.count == 4 && TYPE_SUFFIXES.include?(parts.last)

      parts.first(3).all? { |part| part.strip =~ /\A\d+\z/ }
    end
  end
end
@@ -0,0 +1,92 @@
1
+ # encoding=utf-8
2
module CoaOpScraper
  require 'legacy'
  require 'tames'
  require 'coa_docket_no'

  require 'date'
  require 'open-uri'
  require 'active_support/core_ext'

  # The Texas appellate websites are sometimes fragile.
  # These sleep intervals should give ample time between requests.
  HISTORICAL_THROTTLE = 10
  CURRENT_THROTTLE = 5

  # When false, .urls_for_historical_range skips Saturdays and Sundays.
  # (Lowercase `false`: the FALSE constant no longer exists in current Ruby.)
  @@check_weekends = false

  # A court's placement in one of these two lists tells you about the webpage
  # format currently used by that court.
  TAMES_COAS = [ "01", "03", "04", "05", "06", "09", "11", "12", "14" ]
  LEGACY_COAS = [ "02", "07", "08", "10", "13" ]

  ############################################################
  # This is the easiest method to use here. Feed it a COA value
  # (in the form "03", for example) and the date for which you
  # want the results (in the form of a Ruby date object).
  #
  # Returns whatever the matching scraper's parse_opinion_list returns for
  # the downloaded page, or nil (without making a request) when +coa+ is in
  # neither court list.
  def self.scrape_one_opinion_list(coa, target_date)
    parser = parser_for(coa)
    return nil unless parser

    parser.parse_opinion_list(retrieve_list_for_coa_for_date(coa, target_date))
  end

  ############################################################
  # These methods would be useful to populate a queue of opinion
  # lists to check later.
  #
  # The #urls_for_historical_range method will, as expected,
  # compute a list of the URLs that are appropriate (excluding
  # weekends by default).
  #
  # The #parse_coa_opinion_list_at method will take a coa number
  # and a URL and return back a list of the results.

  # Returns an array with one URL per date in start_date..end_date
  # (both ends included).
  def self.urls_for_historical_range(coa, start_date, end_date)
    dates = (start_date .. end_date).select do |target_date|
      @@check_weekends || target_date.weekday?
    end
    dates.map { |target_date| url_for_coa_for_date(coa, target_date) }
  end

  # Takes a court number and the URL of one of its opinion-list pages;
  # returns the list of opinion data found there, or [] when the court is
  # unknown or the parser returns nothing.
  #
  # The URL is fetched as given. (It must not be routed through
  # .scrape_one_opinion_list, which expects a date in that position.)
  def self.parse_coa_opinion_list_at(coa, url)
    parser = parser_for(coa)
    return [] unless parser

    parser.parse_opinion_list(fetch_document(url)) || []
  end

  # NOTE: the methods below are public. A bare `protected` keyword does not
  # apply to methods defined with `def self.`, so none is used here.

  # Returns the opinion-list URL for the court and date, as computed by the
  # scraper for that court's page format; nil when +coa+ is in neither list.
  def self.url_for_coa_for_date(coa,date)
    parser = parser_for(coa)
    parser.url_for_coa_for_date(coa,date) if parser
  end

  # Downloads the opinion-list page for the court and date and returns the
  # IO-like object produced by open-uri.
  def self.retrieve_list_for_coa_for_date(coa,date)
    fetch_document(url_for_coa_for_date(coa,date))
  end

  # Picks the scraper that understands the page format used by +coa+:
  # Tames, Legacy, or nil for a court number in neither list.
  def self.parser_for(coa)
    if TAMES_COAS.include?(coa)
      CoaOpScraper::Tames
    elsif LEGACY_COAS.include?(coa)
      CoaOpScraper::Legacy
    end
  end

  # Opens +url+ through open-uri's URI#open rather than Kernel#open:
  # Kernel#open stopped handling URLs in Ruby 3.0, and it would run a shell
  # command if ever handed a string beginning with "|".
  def self.fetch_document(url)
    URI(url).open
  end

  private_class_method :parser_for, :fetch_document
end
76
+
77
+ # This is required (and helpful) to parse Texas court docket pages
78
class String
  # Gets rid of some pesky unicode found on Texas OCA sites.
  #
  # One pass: trim ordinary whitespace, drop a single non-breaking space
  # (U+00A0) sitting at the end of a line, then a single one sitting at the
  # start of a line, and trim ordinary whitespace again. Returns a new string.
  def nbsp_strip
    trimmed = strip
    without_trailing = trimmed.gsub(/\u00a0$/,"")
    without_leading = without_trailing.gsub(/^\u00a0/,"")
    without_leading.strip
  end

  # Runs two nbsp_strip passes on the string, then two more on its reversal,
  # and returns the result in the original character order.
  def strip_both_ends
    forward_pass = nbsp_strip.nbsp_strip
    backward_pass = forward_pass.reverse.nbsp_strip.nbsp_strip
    backward_pass.reverse
  end
end
87
+
88
class Date
  # True for Monday through Friday, false for Saturday and Sunday.
  def weekday?
    !(saturday? || sunday?)
  end
end
92
+ end