case_scraper 0.0.4 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +14 -6
- data/lib/case_scraper.rb +120 -18
- metadata +9 -10
checksums.yaml
CHANGED
@@ -1,7 +1,15 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
NGQ0ODk4YjUzYTJjNWE0YTU0MGY3ZDAyMjE4ZGY3MTM3MjY1ODE1Ng==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YTQ4MmMxNDk2MmEwOWE2N2YzOTFhYThlMTQ0NTY0YWRlZTVjYzQ2MQ==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
ZGFkODBjYTU2NGE0YWU0OTg0OTcxYmVlNWNjNGI4Y2I5ODQwMzI5ZDQ5Yjdj
|
10
|
+
MDVjMDkyZWRiZDVhMzAwNWJkYmY1ZjlmZGRhMWZlMjIzYTlmMjk4MWFiNDgz
|
11
|
+
OGMyODAyYjM1OTE2ZDk3ZDQ4M2RiMmI2NTllZTNiYjc2MGY2ZWM=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
YTNjYzI1YmVkOWVlMjVmNDMwYTczMDRhODJkZDQzYjUwZTI4N2EwYjBlZDVk
|
14
|
+
NTA1NWQ1NzhiZWYzNmJmYmYwMDY0OTMwYzQ1OGVhNzU3OGYwYmMyZWFkZjQ5
|
15
|
+
YjQwODVlYjhlZTBhZDJjNjBhOThiN2ZiMzg1MWVjNWQzZTMxN2M=
|
data/lib/case_scraper.rb
CHANGED
@@ -2,13 +2,22 @@
|
|
2
2
|
#
|
3
3
|
# This script uses the Yomu library
|
4
4
|
# to extract the text from a legal
|
5
|
-
# brief in RTF format.
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
9
|
-
|
10
|
-
#
|
11
|
-
#
|
5
|
+
# brief that is in RTF format. The
|
6
|
+
# script then applies regular
|
7
|
+
# expressions to search the text of
|
8
|
+
# the document for case citations.
|
9
|
+
|
10
|
+
# You have the option of extracting
|
11
|
+
# only the volume, reporter, and starting
|
12
|
+
# page number for each case, or extracting
|
13
|
+
# the entire citation (though this is
|
14
|
+
# imperfect) along with the organizational
|
15
|
+
# headings from the brief, with the case
|
16
|
+
# citations each placed under the
|
17
|
+
# appropriate section or subsection. The
|
18
|
+
# case citations and heading text are
|
19
|
+
# then output to a text file, with one
|
20
|
+
# citation or heading per line.
|
12
21
|
#
|
13
22
|
# This is an experimental script. Use
|
14
23
|
# at your own risk and please be sure
|
@@ -37,16 +46,57 @@
|
|
37
46
|
# Require the external Yomu library.
|
38
47
|
|
39
48
|
require 'yomu'
|
49
|
+
require 'optparse'
|
50
|
+
|
51
|
+
# Define options for toggling citation
|
52
|
+
# type and inclusion of headings in the
|
53
|
+
# brief.
|
54
|
+
|
55
|
+
options = {}
|
56
|
+
|
57
|
+
optparse = OptionParser.new do|opts|
|
58
|
+
# Help banner.
|
59
|
+
opts.banner = "Usage: case_scraper.rb [options] InputFile.rtf OutputFile.txt ..."
|
60
|
+
|
61
|
+
# Define options.
|
62
|
+
options[:fullcite] = false
|
63
|
+
opts.on( '-f', '--full', 'Scrape complete case citations with headings.' ) do
|
64
|
+
options[:fullcite] = true
|
65
|
+
end
|
66
|
+
|
67
|
+
opts.on( '-h', '--help', 'Display help.' ) do
|
68
|
+
puts opts
|
69
|
+
exit
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Parse the command-line and remove options from ARGV array.
|
74
|
+
optparse.parse!
|
40
75
|
|
41
76
|
# Test that input and output files
|
42
77
|
# are specified.
|
43
78
|
|
44
79
|
unless ARGV.length == 2
|
45
80
|
puts "You must include an input filename (RTF format) and an output filename (TXT format)."
|
46
|
-
puts "Usage: ruby
|
81
|
+
puts "Usage: ruby case_scraper.rb InputFile.rtf OutputFile.txt\n"
|
47
82
|
exit
|
48
83
|
end
|
49
84
|
|
85
|
+
if options[:fullcite]
|
86
|
+
|
87
|
+
# Regexp for scraping the entire case citation
|
88
|
+
|
89
|
+
citationregex = "(((([a-z]{3}(\\.|\\?|!)\"?)|see also|see|accord)\\s)|^[A-Z][a-z]*\\s([Vv][\\.]*\\s))[A-Z].{1,125}(\\,|\\s(\\[\\d{4,4}\\])|\\s\\(\\d{4,4}\\)|\\s)\\s\\d{1,4}\\s[A-Za-z0-9\\.\\s]{3,17}\\d{1,8}(\\s?|\\,|\\.)((\\(|\\[)([A-Za-z0-9\\.\\,\\s]{1,20})*\\d{4,4}(\\]|\\))*)*|^\\s*[A-Za-z0-9]{1,4}\\.\\s.{1,150}$"
|
90
|
+
else
|
91
|
+
|
92
|
+
# Regexp for scraping just the volume,
|
93
|
+
# reporter, and first page number.
|
94
|
+
|
95
|
+
citationregex = "\\d{1,4}\\s\[^\\[\\(]{1,16}\\s\\d{1,7}"
|
96
|
+
end
|
97
|
+
|
98
|
+
# Regexp for scraping the brief headings.
|
99
|
+
|
50
100
|
# Set the two command line arguments
|
51
101
|
# as variables.
|
52
102
|
|
@@ -82,22 +132,67 @@ else
|
|
82
132
|
end
|
83
133
|
|
84
134
|
# Using regular expressions, extract
|
85
|
-
# the
|
86
|
-
#
|
87
|
-
# number for (hopefully . . . ) each
|
88
|
-
# full case citation in the brief.
|
89
|
-
#
|
135
|
+
# the citations and headings, per
|
136
|
+
# the specified options.
|
90
137
|
# The regular expressions should
|
91
138
|
# catch common variations, including
|
92
139
|
# spacing or lack of spacing in the
|
93
140
|
# reporter abbreviation,
|
94
141
|
# state-specific citation formats for
|
95
142
|
# New York and California, and
|
96
|
-
#
|
97
|
-
# unpublished rulings.
|
143
|
+
# WestLaw and LEXIS citations for
|
144
|
+
# unpublished rulings. As you may see
|
145
|
+
# if you run the example file, the script
|
146
|
+
# is imperfect and may over-select when
|
147
|
+
# dealing with short sentences or
|
148
|
+
# certain citations at the beginning
|
149
|
+
# of a new line.
|
98
150
|
|
99
151
|
puts "Scraping case citations from #{input}."
|
100
|
-
|
152
|
+
|
153
|
+
# Alternative processing procedures
|
154
|
+
# depending on whether the script is
|
155
|
+
# extracting full or partial citations.
|
156
|
+
# Slightly more processing is required
|
157
|
+
# for full citations because the regexp
|
158
|
+
# has a lot of nested captures and the
|
159
|
+
# resulting array is more complex.
|
160
|
+
|
161
|
+
if options[:fullcite]
|
162
|
+
allcapture_array = text.scan(/(#{citationregex})/)
|
163
|
+
|
164
|
+
fullcase_array = allcapture_array.transpose
|
165
|
+
|
166
|
+
fullcases_only_array = fullcase_array.first
|
167
|
+
|
168
|
+
fullcases_only_array.flatten
|
169
|
+
|
170
|
+
fullcases_only_array.each do |a|
|
171
|
+
a.gsub!(/^[a-z]{3,3}\."?\s/, "")
|
172
|
+
end
|
173
|
+
|
174
|
+
fullcases_only_array.reject! { |c| c.empty? }
|
175
|
+
|
176
|
+
puts "Writing case citations to #{output}."
|
177
|
+
|
178
|
+
File.open(target, "w+") do |f|
|
179
|
+
f.puts(fullcases_only_array)
|
180
|
+
end
|
181
|
+
|
182
|
+
else
|
183
|
+
allcapture_array = text.scan(/(#{citationregex})/)
|
184
|
+
fullcase_array = allcapture_array.transpose
|
185
|
+
|
186
|
+
fullcases_only_array = fullcase_array.first
|
187
|
+
|
188
|
+
fullcases_only_array.reject! { |c| c.empty? }
|
189
|
+
|
190
|
+
|
191
|
+
fullcases_only_array.flatten
|
192
|
+
File.open(target, "w+") do |f|
|
193
|
+
f.puts(fullcases_only_array)
|
194
|
+
end
|
195
|
+
end
|
101
196
|
|
102
197
|
puts "Writing case citations to #{output}."
|
103
198
|
|
@@ -108,12 +203,19 @@ target.close()
|
|
108
203
|
# Open the output file in the user's
|
109
204
|
# default text editor.
|
110
205
|
#
|
111
|
-
#
|
112
|
-
#
|
206
|
+
# If the chosen output is a bare,
|
207
|
+
# partial citation list, then the
|
208
|
+
# output, with one citation per line,
|
209
|
+
# is in the appropriate format for
|
113
210
|
# cutting and pasting directly into
|
114
211
|
# Westlaw "Find & Print." (The
|
115
212
|
# maximum number of lines for a
|
116
213
|
# single query is 99.)
|
214
|
+
#
|
215
|
+
# For full citations with headings,
|
216
|
+
# the output should save you time in
|
217
|
+
# structuring your opposition / reply
|
218
|
+
# brief.
|
117
219
|
|
118
220
|
%x{ call #{output} }
|
119
221
|
|
metadata
CHANGED
@@ -1,31 +1,31 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: case_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Greg Kochansky
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-10-
|
11
|
+
date: 2013-10-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: yomu
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - '>='
|
17
|
+
- - ! '>='
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '0'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - '>='
|
24
|
+
- - ! '>='
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
-
description: Scrape case citations
|
28
|
-
|
27
|
+
description: Scrape case citations and section headings from legal briefs in *.rtf
|
28
|
+
format.
|
29
29
|
email: greg@greg-k.com
|
30
30
|
executables: []
|
31
31
|
extensions: []
|
@@ -42,12 +42,12 @@ require_paths:
|
|
42
42
|
- lib
|
43
43
|
required_ruby_version: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - '>='
|
45
|
+
- - ! '>='
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '0'
|
48
48
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
49
49
|
requirements:
|
50
|
-
- - '>='
|
50
|
+
- - ! '>='
|
51
51
|
- !ruby/object:Gem::Version
|
52
52
|
version: '0'
|
53
53
|
requirements: []
|
@@ -55,6 +55,5 @@ rubyforge_project:
|
|
55
55
|
rubygems_version: 2.0.7
|
56
56
|
signing_key:
|
57
57
|
specification_version: 4
|
58
|
-
summary: Scrape case citations
|
59
|
-
legal briefs that are in rtf format.
|
58
|
+
summary: Scrape case citations and section headings from legal briefs in *.rtf format.
|
60
59
|
test_files: []
|