case_scraper 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/case_scraper.rb +55 -13
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9326a2861a86a4f00775fe88c5017036847c688f
|
4
|
+
data.tar.gz: 37918563e595a7bc4022af73214b826f354b0d50
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3590bc69958a214affe3a1760215c73d8d45fa9ae41e4119a982a6512bac1df1b182631736025036c1b7b7ef1025bac9f172cd88f0690f1db2decb87d1bd1171
|
7
|
+
data.tar.gz: a65d66e841a8d06083864591c4c81989a2e87e8ba5b394e08bd8510b8e3579807cc443654fa563b2ec6b546f7d68b52dab32a17c8052099518f5727ab80675e6
|
data/lib/case_scraper.rb
CHANGED
@@ -1,12 +1,33 @@
|
|
1
1
|
# case_scraper
|
2
2
|
#
|
3
|
-
# This script uses the Yomu library
|
3
|
+
# This script uses the Yomu library
|
4
|
+
# to extract the text from a legal
|
5
|
+
# brief in RTF format. It then uses
|
6
|
+
# regular expressions to search the
|
7
|
+
# document for case citations and
|
8
|
+
# extract the volume, reporter, and
|
9
|
+
# starting page for each case. The case
|
10
|
+
# citations are then output to a text
|
11
|
+
# file, with one citation per line.
|
4
12
|
#
|
5
|
-
# This is an experimental script. Use
|
13
|
+
# This is an experimental script. Use
|
14
|
+
# at your own risk and please be sure
|
15
|
+
# to check the output against the
|
16
|
+
# list of cases in the original brief
|
17
|
+
# to ensure you have a complete list.
|
6
18
|
#
|
7
|
-
# THIS SCRIPT ONLY FOCUSES ON
|
19
|
+
# THIS SCRIPT ONLY FOCUSES ON
|
20
|
+
# REPORTED U.S. STATE AND FEDERAL
|
21
|
+
# CASELAW AND UNREPORTED U.S. STATE
|
22
|
+
# AND FEDERAL CASELAW FOR WHICH A
|
23
|
+
# LEXIS OR WESTLAW CITATION IS
|
24
|
+
# AVAILABLE. IT IS NOT DESIGNED TO
|
25
|
+
# DETECT CITATIONS TO CASELAW FROM
|
26
|
+
# ANY OTHER JURISDICTION.
|
8
27
|
#
|
9
|
-
# THIS SCRIPT ONLY EXTRACTS CASE
|
28
|
+
# THIS SCRIPT ONLY EXTRACTS CASE
|
29
|
+
# CITATIONS, NOT CITATIONS FOR
|
30
|
+
# STATUTES OR OTHER AUTHORITIES.
|
10
31
|
#
|
11
32
|
# Author: Greg Kochansky
|
12
33
|
# E-mail: greg@greg-k.com
|
@@ -17,7 +38,8 @@
|
|
17
38
|
|
18
39
|
require 'yomu'
|
19
40
|
|
20
|
-
# Test that input and output files
|
41
|
+
# Test that input and output files
|
42
|
+
# are specified.
|
21
43
|
|
22
44
|
unless ARGV.length == 2
|
23
45
|
puts "You must include an input filename (RTF format) and an output filename (TXT format)."
|
@@ -25,24 +47,27 @@ unless ARGV.length == 2
|
|
25
47
|
exit
|
26
48
|
end
|
27
49
|
|
28
|
-
# Set the two command line arguments
|
50
|
+
# Set the two command line arguments
|
51
|
+
# as variables.
|
29
52
|
|
30
53
|
input = ARGV[0]
|
31
54
|
output = ARGV[1]
|
32
55
|
|
33
|
-
# Import body text from the input RTF
|
56
|
+
# Import body text from the input RTF
|
57
|
+
# file.
|
34
58
|
|
35
59
|
data = File.read input
|
36
60
|
text = Yomu.read :text, data
|
37
61
|
|
38
|
-
# Test whether output file already
|
62
|
+
# Test whether output file already
|
63
|
+
# exists.
|
39
64
|
#
|
40
65
|
# If so, then warn before overwriting.
|
41
66
|
#
|
42
67
|
# If not, then create the output file.
|
43
68
|
|
44
69
|
if File.exist?(output)
|
45
|
-
puts "WARNING:
|
70
|
+
puts "WARNING: #{output} already exists. It will be overwritten." # name the output file: it is the one tested with File.exist? and reopened for writing
|
46
71
|
puts "Press CTRL-C to cancel, or press RETURN to proceed."
|
47
72
|
print "[CTRL-C or RETURN?]" # prompt shown just before the script waits on STDIN.gets
|
48
73
|
STDIN.gets
|
@@ -56,9 +81,20 @@ else
|
|
56
81
|
target = File.open(output, 'w')
|
57
82
|
end
|
58
83
|
|
59
|
-
# Using regular expressions, extract
|
84
|
+
# Using regular expressions, extract
|
85
|
+
# the case volume, the reporter
|
86
|
+
# abbreviation, and the starting page
|
87
|
+
# number for (hopefully . . . ) each
|
88
|
+
# full case citation in the brief.
|
60
89
|
#
|
61
|
-
# The regular expressions should
|
90
|
+
# The regular expressions should
|
91
|
+
# catch common variations, including
|
92
|
+
# spacing or lack of spacing in the
|
93
|
+
# reporter abbreviation,
|
94
|
+
# state-specific citation formats for
|
95
|
+
# New York and California, and
|
96
|
+
# Westlaw and LEXIS citations for
|
97
|
+
# unpublished rulings.
|
62
98
|
|
63
99
|
puts "Scraping case citations from #{input}."
|
64
100
|
# Pattern, left to right: a run of digits (the volume), one whitespace
# character, a reporter abbreviation made of up to four word/period groups
# with optional whitespace between them, one whitespace character, and a
# final run of digits (the starting page). Every match is written to the
# output file on its own line.
# NOTE(review): `\i` is not a standard regex escape; it presumably matches a
# literal "i" zero or more times -- confirm the intent.
text.scan(/\d+\s\i*\w+\.+\s*\w*\.*\s*\w*\.*\s*\w*\.*\s\d+/) { |w| target.write "#{w}\n" }
|
@@ -69,9 +105,15 @@ puts "Writing case citations to #{output}."
|
|
69
105
|
|
70
106
|
target.close()
|
71
107
|
|
72
|
-
# Open the output file in the user's
|
108
|
+
# Open the output file in the user's
|
109
|
+
# default text editor.
|
73
110
|
#
|
74
|
-
# The citation list, one per line, is
|
111
|
+
# The citation list, one per line, is
|
112
|
+
# in the appropriate format for
|
113
|
+
# cutting and pasting directly into
|
114
|
+
# Westlaw "Find & Print." (The
|
115
|
+
# maximum number of lines for a
|
116
|
+
# single query is 99.)
|
75
117
|
|
76
118
|
# NOTE(review): `call` is a cmd.exe built-in, so this presumably works only on Windows; the path is interpolated unquoted, so a filename containing spaces may fail -- confirm.
%x{ call #{output} }
|
77
119
|
|