case_scraper 0.0.2 → 0.0.3
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/case_scraper.rb +55 -13
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9326a2861a86a4f00775fe88c5017036847c688f
|
4
|
+
data.tar.gz: 37918563e595a7bc4022af73214b826f354b0d50
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3590bc69958a214affe3a1760215c73d8d45fa9ae41e4119a982a6512bac1df1b182631736025036c1b7b7ef1025bac9f172cd88f0690f1db2decb87d1bd1171
|
7
|
+
data.tar.gz: a65d66e841a8d06083864591c4c81989a2e87e8ba5b394e08bd8510b8e3579807cc443654fa563b2ec6b546f7d68b52dab32a17c8052099518f5727ab80675e6
|
data/lib/case_scraper.rb
CHANGED
@@ -1,12 +1,33 @@
|
|
1
1
|
# case_scraper
|
2
2
|
#
|
3
|
-
# This script uses the Yomu library
|
3
|
+
# This script uses the Yomu library
|
4
|
+
# to extract the text from a legal
|
5
|
+
# brief in RTF format. It then uses
|
6
|
+
# regular expressions to search the
|
7
|
+
# document for case citations and
|
8
|
+
# extract the volume, reporter, and
|
9
|
+
# starting page for each case. The case
|
10
|
+
# citations are then output to a text
|
11
|
+
# file, with one citation per line.
|
4
12
|
#
|
5
|
-
# This is an experimental script. Use
|
13
|
+
# This is an experimental script. Use
|
14
|
+
# at your own risk and please be sure
|
15
|
+
# to check the output against the
|
16
|
+
# list of cases in the original brief
|
17
|
+
# to ensure you have a complete list.
|
6
18
|
#
|
7
|
-
# THIS SCRIPT ONLY FOCUSES ON
|
19
|
+
# THIS SCRIPT ONLY FOCUSES ON
|
20
|
+
# REPORTED U.S. STATE AND FEDERAL
|
21
|
+
# CASELAW AND UNREPORTED U.S. STATE
|
22
|
+
# AND FEDERAL CASELAW FOR WHICH A
|
23
|
+
# LEXIS OR WESTLAW CITATION IS
|
24
|
+
# AVAILABLE. IT IS NOT DESIGNED TO
|
25
|
+
# DETECT CITATIONS TO CASELAW FROM
|
26
|
+
# ANY OTHER JURISDICTION.
|
8
27
|
#
|
9
|
-
# THIS SCRIPT ONLY EXTRACTS CASE
|
28
|
+
# THIS SCRIPT ONLY EXTRACTS CASE
|
29
|
+
# CITATIONS, NOT CITATIONS FOR
|
30
|
+
# STATUTES OR OTHER AUTHORITIES.
|
10
31
|
#
|
11
32
|
# Author: Greg Kochansky
|
12
33
|
# E-mail: greg@greg-k.com
|
@@ -17,7 +38,8 @@
|
|
17
38
|
|
18
39
|
require 'yomu'
|
19
40
|
|
20
|
-
# Test that input and output files
|
41
|
+
# Test that input and output files
|
42
|
+
# are specified.
|
21
43
|
|
22
44
|
unless ARGV.length == 2
|
23
45
|
puts "You must include an input filename (RTF format) and an output filename (TXT format)."
|
@@ -25,24 +47,27 @@ unless ARGV.length == 2
|
|
25
47
|
exit
|
26
48
|
end
|
27
49
|
|
28
|
-
# Set the two command line arguments
|
50
|
+
# Set the two command line arguments
|
51
|
+
# as variables.
|
29
52
|
|
30
53
|
input = ARGV[0]
|
31
54
|
output = ARGV[1]
|
32
55
|
|
33
|
-
# Import body text from the input RTF
|
56
|
+
# Import body text from the input RTF
|
57
|
+
# file.
|
34
58
|
|
35
59
|
data = File.read input
|
36
60
|
text = Yomu.read :text, data
|
37
61
|
|
38
|
-
# Test whether output file already
|
62
|
+
# Test whether output file already
|
63
|
+
# exists.
|
39
64
|
#
|
40
65
|
# If so, then warn before overwriting.
|
41
66
|
#
|
42
67
|
# If not, then create the output file.
|
43
68
|
|
44
69
|
if File.exist?(output)
|
45
|
-
puts "WARNING:
|
70
|
+
puts "WARNING: #{input} already exists. It will be overwritten."
|
46
71
|
puts "Press CTRL-C to cancel, or press RETURN to proceed."
|
47
72
|
print "[CRTL-C or RETURN?]"
|
48
73
|
STDIN.gets
|
@@ -56,9 +81,20 @@ else
|
|
56
81
|
target = File.open(output, 'w')
|
57
82
|
end
|
58
83
|
|
59
|
-
# Using regular expressions, extract
|
84
|
+
# Using regular expressions, extract
|
85
|
+
# the case volume, the reporter
|
86
|
+
# abbreviation, and the starting page
|
87
|
+
# number for (hopefully . . . ) each
|
88
|
+
# full case citation in the brief.
|
60
89
|
#
|
61
|
-
# The regular expressions should
|
90
|
+
# The regular expressions should
|
91
|
+
# catch common variations, including
|
92
|
+
# spacing or lack of spacing in the
|
93
|
+
# reporter abbreviation,
|
94
|
+
# state-specific citation formats for
|
95
|
+
# New York and California, and
|
96
|
+
# Westlaw and LEXIS citations for
|
97
|
+
# unpublished rulings.
|
62
98
|
|
63
99
|
puts "Scraping case citations from #{input}."
|
64
100
|
text.scan(/\d+\s\i*\w+\.+\s*\w*\.*\s*\w*\.*\s*\w*\.*\s\d+/) { |w| target.write "#{w}\n" }
|
@@ -69,9 +105,15 @@ puts "Writing case citations to #{output}."
|
|
69
105
|
|
70
106
|
target.close()
|
71
107
|
|
72
|
-
# Open the output file in the user's
|
108
|
+
# Open the output file in the user's
|
109
|
+
# default text editor.
|
73
110
|
#
|
74
|
-
# The citation list, one per line, is
|
111
|
+
# The citation list, one per line, is
|
112
|
+
# in the appropriate format for
|
113
|
+
# cutting and pasting directly into
|
114
|
+
# Westlaw "Find & Print." (The
|
115
|
+
# maximum number of lines for a
|
116
|
+
# single query is 99.)
|
75
117
|
|
76
118
|
%x{ call #{output} }
|
77
119
|
|