case_scraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +15 -0
  2. data/lib/case_scraper.rb +78 -0
  3. metadata +59 -0
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MTY1NmQ1OTYxNDI4ODdhZTgxYjBlYzkyY2FkODRjZmNmNTA1MTRkYQ==
5
+ data.tar.gz: !binary |-
6
+ MDMxNDkwYzUzNThkOTdlY2VjMjg5ZTVjYTA3Y2ZiMmJhMzhiODhhOQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ YmQ0MzAxYzhkMzVlNjQzYzc2MzBjYTBiMmVmZmI0MmYzMDhlYzA2MzY4NmQ5
10
+ MTdhYTc0OWRkM2QyZDllN2ZjNjkxYmRmNTRiM2Y0NmMwNWI0YTI2NmM1ZmI3
11
+ YzM2NzNkNWU2Y2Q2ODgwZjc1Y2FkNmEyNTI5N2M0ODlhMzNmZDk=
12
+ data.tar.gz: !binary |-
13
+ ZWE0YmQ0MGViZTdmYTg0NThhNTZkNDg2NzJhMTRlNmY3MGVjY2ZkNjU3M2Rm
14
+ MDUyMmUyZWZlZWQ1NDNjYjA5ZTYwMzUzYWEwOGYzMDYzZjhmMjA0ZTE2MjE2
15
+ N2YxMWNhMDNhMTY3ZGFkZmQ2NGRkOWMxYWFkNzNiMDcxMTFiN2U=
@@ -0,0 +1,78 @@
1
+ # case_scraper
2
+ #
3
+ # This script uses the Yomu library to extract the text from a legal brief in RTF format. It then uses regular expressions to search the document for case citations and extract the volume, reporter, and starting page for each case. The case citations are then output to a text file, with one citation per line.
4
+ #
5
+ # This is an experimental script. Use at your own risk and please be sure to check the output against the list of cases in the original brief to ensure you have a complete list.
6
+ #
7
+ # THIS SCRIPT ONLY FOCUSES ON REPORTED U.S. STATE AND FEDERAL CASELAW AND UNREPORTED U.S. STATE AND FEDERAL CASELAW FOR WHICH A LEXIS OR WESTLAW CITATION IS AVAILABLE. IT IS NOT DESIGNED TO DETECT CITATIONS TO CASELAW FROM ANY OTHER JURISDICTION.
8
+ #
9
+ # THIS SCRIPT ONLY EXTRACTS CASE CITATIONS, NOT CITATIONS FOR STATUTES OR OTHER AUTHORITIES.
10
+ #
11
+ # Author: Greg Kochansky
12
+ # E-mail: greg@greg-k.com
13
+ # Website: www.greg-k.com
14
+ # License: MIT
15
+
16
+ # Require the external Yomu library.
17
+
18
+ require 'yomu'
19
+
20
+ # Test that input and output files are specified.
21
+
22
+ unless ARGV.length == 2
23
+ puts "You must include an input filename (RTF format) and an output filename (TXT format)."
24
+ puts "Usage: ruby MyScript.rb InputFile.rtf OutputFile.txt\n"
25
+ exit
26
+ end
27
+
28
+ # Set the two command line arguments as variables.
29
+
30
+ input = ARGV[0]
31
+ output = ARGV[1]
32
+
33
+ # Import body text from the input RTF file.
34
+
35
+ data = File.read input
36
+ text = Yomu.read :text, data
37
+
38
+ # Test whether output file already exists.
39
+ #
40
+ # If so, then warn before overwriting.
41
+ #
42
+ # If not, then create the output file.
43
+
44
+ if File.exist?(output)
45
+ puts "WARNING: if #{input} already exists, then it will be overwritten."
46
+ puts "Press CTRL-C to cancel, or press RETURN to proceed."
47
+ print "[CRTL-C or RETURN?]"
48
+ STDIN.gets
49
+
50
+ puts "Opening #{output}."
51
+ target = File.open(output, 'w')
52
+ puts "Erasing contents of #{output}."
53
+ target.truncate(target.size)
54
+ else
55
+ puts "Creating #{output}."
56
+ target = File.open(output, 'w')
57
+ end
58
+
59
+ # Using regular expressions, extract the case volume, the reporter abbreviation, and the starting page number for (hopefully . . . ) each full case citation in the brief.
60
+ #
61
+ # The regular expressions should catch common variations, including spacing or lack of spacing in the reporter abbreviation, state-specific citation formats for New York and California, and Westlaw and LEXIS citations for unpublished rulings.
62
+
63
+ puts "Scraping case citations from #{input}."
64
+ text.scan(/\d+\s\i*\w+\.+\s*\w*\.*\s*\w*\.*\s*\w*\.*\s\d+/) { |w| target.write "#{w}\n" }
65
+
66
+ puts "Writing case citations to #{output}."
67
+
68
+ # Close the output file after writing.
69
+
70
+ target.close()
71
+
72
+ # Open the output file in the user's default text editor.
73
+ #
74
+ # The citation list, one per line, is in the appropriate format for cutting and pasting directly into Westlaw "Find & Print." (The maximum number of lines for a single query is 99.)
75
+
76
+ %x{ call #{output} }
77
+
78
+ Process.exit()
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: case_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Greg Kochansky
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-10-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: yomu
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ! '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ description: Scrape case citations from legal briefs in RTF format using the Yomu
28
+ text extraction library.
29
+ email: greg@greg-k.com
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - lib/case_scraper.rb
35
+ homepage: http://github.com/gkochans/case_scraper
36
+ licenses:
37
+ - MIT
38
+ metadata: {}
39
+ post_install_message:
40
+ rdoc_options: []
41
+ require_paths:
42
+ - lib
43
+ required_ruby_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ! '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ required_rubygems_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ! '>='
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ requirements: []
54
+ rubyforge_project:
55
+ rubygems_version: 2.0.7
56
+ signing_key:
57
+ specification_version: 4
58
+ summary: Scrape case citations from legal briefs in rtf format.
59
+ test_files: []