case_scraper 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/lib/case_scraper.rb +78 -0
- metadata +59 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MTY1NmQ1OTYxNDI4ODdhZTgxYjBlYzkyY2FkODRjZmNmNTA1MTRkYQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
MDMxNDkwYzUzNThkOTdlY2VjMjg5ZTVjYTA3Y2ZiMmJhMzhiODhhOQ==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
YmQ0MzAxYzhkMzVlNjQzYzc2MzBjYTBiMmVmZmI0MmYzMDhlYzA2MzY4NmQ5
|
10
|
+
MTdhYTc0OWRkM2QyZDllN2ZjNjkxYmRmNTRiM2Y0NmMwNWI0YTI2NmM1ZmI3
|
11
|
+
YzM2NzNkNWU2Y2Q2ODgwZjc1Y2FkNmEyNTI5N2M0ODlhMzNmZDk=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
ZWE0YmQ0MGViZTdmYTg0NThhNTZkNDg2NzJhMTRlNmY3MGVjY2ZkNjU3M2Rm
|
14
|
+
MDUyMmUyZWZlZWQ1NDNjYjA5ZTYwMzUzYWEwOGYzMDYzZjhmMjA0ZTE2MjE2
|
15
|
+
N2YxMWNhMDNhMTY3ZGFkZmQ2NGRkOWMxYWFkNzNiMDcxMTFiN2U=
|
data/lib/case_scraper.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
# case_scraper
|
2
|
+
#
|
3
|
+
# This script uses the Yomu library to extract the text from a legal brief in RTF format. It then uses regular expressions to search the document for case citations and extract the volume, reporter, and starting page for each case. The case citations are then output to a text file, with one citation per line.
|
4
|
+
#
|
5
|
+
# This is an experimental script. Use at your own risk and please be sure to check the output against the list of cases in the original brief to ensure you have a complete list.
|
6
|
+
#
|
7
|
+
# THIS SCRIPT ONLY FOCUSES ON REPORTED U.S. STATE AND FEDERAL CASELAW AND UNREPORTED U.S. STATE AND FEDERAL CASELAW FOR WHICH A LEXIS OR WESTLAW CITATION IS AVAILABLE. IT IS NOT DESIGNED TO DETECT CITATIONS TO CASELAW FROM ANY OTHER JURISDICTION.
|
8
|
+
#
|
9
|
+
# THIS SCRIPT ONLY EXTRACTS CASE CITATIONS, NOT CITATIONS FOR STATUTES OR OTHER AUTHORITIES.
|
10
|
+
#
|
11
|
+
# Author: Greg Kochansky
|
12
|
+
# E-mail: greg@greg-k.com
|
13
|
+
# Website: www.greg-k.com
|
14
|
+
# License: MIT
|
15
|
+
|
16
|
+
# Require the external Yomu library.
|
17
|
+
|
18
|
+
require 'yomu'
|
19
|
+
|
20
|
+
# Test that input and output files are specified.
|
21
|
+
|
22
|
+
unless ARGV.length == 2
|
23
|
+
puts "You must include an input filename (RTF format) and an output filename (TXT format)."
|
24
|
+
puts "Usage: ruby MyScript.rb InputFile.rtf OutputFile.txt\n"
|
25
|
+
exit
|
26
|
+
end
|
27
|
+
|
28
|
+
# Set the two command line arguments as variables.
|
29
|
+
|
30
|
+
input = ARGV[0]
|
31
|
+
output = ARGV[1]
|
32
|
+
|
33
|
+
# Import body text from the input RTF file.
|
34
|
+
|
35
|
+
data = File.read input
|
36
|
+
text = Yomu.read :text, data
|
37
|
+
|
38
|
+
# Test whether output file already exists.
|
39
|
+
#
|
40
|
+
# If so, then warn before overwriting.
|
41
|
+
#
|
42
|
+
# If not, then create the output file.
|
43
|
+
|
44
|
+
if File.exist?(output)
|
45
|
+
# The existence check above is on the output file, so name that file in the warning.
puts "WARNING: #{output} already exists and will be overwritten."
|
46
|
+
puts "Press CTRL-C to cancel, or press RETURN to proceed."
|
47
|
+
# Prompt uses print (no trailing newline) so the user's keypress follows it on the same line.
print "[CTRL-C or RETURN?]"
|
48
|
+
STDIN.gets
|
49
|
+
|
50
|
+
puts "Opening #{output}."
|
51
|
+
target = File.open(output, 'w')
|
52
|
+
puts "Erasing contents of #{output}."
|
53
|
+
# The file was just opened with mode 'w', which already truncates an existing
# file to zero length, so target.size is 0 here and this call leaves it empty.
target.truncate(target.size)
|
54
|
+
else
|
55
|
+
puts "Creating #{output}."
|
56
|
+
target = File.open(output, 'w')
|
57
|
+
end
|
58
|
+
|
59
|
+
# Using regular expressions, extract the case volume, the reporter abbreviation, and the starting page number for (hopefully . . . ) each full case citation in the brief.
|
60
|
+
#
|
61
|
+
# The regular expressions should catch common variations, including spacing or lack of spacing in the reporter abbreviation, state-specific citation formats for New York and California, and Westlaw and LEXIS citations for unpublished rulings.
|
62
|
+
|
63
|
+
puts "Scraping case citations from #{input}."
|
64
|
+
# Scan the extracted text for "<number> <reporter abbreviation> <number>" patterns
# and write each match to the output file on its own line.
# NOTE(review): `\i` is not a standard regex shorthand; it presumably matches a
# literal "i" here — confirm whether case-insensitive matching (the /i flag) was intended.
text.scan(/\d+\s\i*\w+\.+\s*\w*\.*\s*\w*\.*\s*\w*\.*\s\d+/) { |w| target.write "#{w}\n" }
|
65
|
+
|
66
|
+
puts "Writing case citations to #{output}."
|
67
|
+
|
68
|
+
# Close the output file after writing.
|
69
|
+
|
70
|
+
target.close()
|
71
|
+
|
72
|
+
# Open the output file in the user's default text editor.
|
73
|
+
#
|
74
|
+
# The citation list, one per line, is in the appropriate format for cutting and pasting directly into Westlaw "Find & Print." (The maximum number of lines for a single query is 99.)
|
75
|
+
|
76
|
+
# Run an external command on the output file; the command's captured output is discarded.
# NOTE(review): `call` looks like the Windows cmd.exe builtin, so this presumably
# only works on Windows — confirm the behavior on other platforms.
# NOTE(review): the output filename comes from ARGV and is interpolated unquoted
# into the command line; names with spaces or shell metacharacters may misbehave.
%x{ call #{output} }
|
77
|
+
|
78
|
+
Process.exit()
|
metadata
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: case_scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Greg Kochansky
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-10-02 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: yomu
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ! '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ! '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
description: Scrape case citations from legal briefs in rtf format using the Yomu
|
28
|
+
text extraction library.
|
29
|
+
email: greg@greg-k.com
|
30
|
+
executables: []
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- lib/case_scraper.rb
|
35
|
+
homepage: http://github.com/gkochans/case_scraper
|
36
|
+
licenses:
|
37
|
+
- MIT
|
38
|
+
metadata: {}
|
39
|
+
post_install_message:
|
40
|
+
rdoc_options: []
|
41
|
+
require_paths:
|
42
|
+
- lib
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ! '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ! '>='
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
requirements: []
|
54
|
+
rubyforge_project:
|
55
|
+
rubygems_version: 2.0.7
|
56
|
+
signing_key:
|
57
|
+
specification_version: 4
|
58
|
+
summary: Scrape case citations from legal briefs in rtf format.
|
59
|
+
test_files: []
|