case_scraper 0.0.4 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +14 -6
- data/lib/case_scraper.rb +120 -18
- metadata +9 -10
checksums.yaml
CHANGED
@@ -1,7 +1,15 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
NGQ0ODk4YjUzYTJjNWE0YTU0MGY3ZDAyMjE4ZGY3MTM3MjY1ODE1Ng==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YTQ4MmMxNDk2MmEwOWE2N2YzOTFhYThlMTQ0NTY0YWRlZTVjYzQ2MQ==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
ZGFkODBjYTU2NGE0YWU0OTg0OTcxYmVlNWNjNGI4Y2I5ODQwMzI5ZDQ5Yjdj
|
10
|
+
MDVjMDkyZWRiZDVhMzAwNWJkYmY1ZjlmZGRhMWZlMjIzYTlmMjk4MWFiNDgz
|
11
|
+
OGMyODAyYjM1OTE2ZDk3ZDQ4M2RiMmI2NTllZTNiYjc2MGY2ZWM=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
YTNjYzI1YmVkOWVlMjVmNDMwYTczMDRhODJkZDQzYjUwZTI4N2EwYjBlZDVk
|
14
|
+
NTA1NWQ1NzhiZWYzNmJmYmYwMDY0OTMwYzQ1OGVhNzU3OGYwYmMyZWFkZjQ5
|
15
|
+
YjQwODVlYjhlZTBhZDJjNjBhOThiN2ZiMzg1MWVjNWQzZTMxN2M=
|
data/lib/case_scraper.rb
CHANGED
@@ -2,13 +2,22 @@
|
|
2
2
|
#
|
3
3
|
# This script uses the Yomu library
|
4
4
|
# to extract the text from a legal
|
5
|
-
# brief in RTF format.
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
9
|
-
|
10
|
-
#
|
11
|
-
#
|
5
|
+
# brief that is in RTF format. The
|
6
|
+
# script then applies regular
|
7
|
+
# expressions to search the text of
|
8
|
+
# the document for case citations.
|
9
|
+
|
10
|
+
# You have the option of extracting
|
11
|
+
# only the volume, reporter, and starting
|
12
|
+
# page number for each case, or extracting
|
13
|
+
# the entire citation (though this is
|
14
|
+
# imperfect) along with the organizational
|
15
|
+
# headings from the brief, with the case
|
16
|
+
# citations each placed under the
|
17
|
+
# appropriate section or subsection. The
|
18
|
+
# case citations and heading text are
|
19
|
+
# then output to a text file, with one
|
20
|
+
# citation or heading per line.
|
12
21
|
#
|
13
22
|
# This is an experimental script. Use
|
14
23
|
# at your own risk and please be sure
|
@@ -37,16 +46,57 @@
|
|
37
46
|
# Require the external Yomu library.
|
38
47
|
|
39
48
|
require 'yomu'
|
49
|
+
require 'optparse'
|
50
|
+
|
51
|
+
# Define options for toggling citation
|
52
|
+
# type and inclusion of headings in the
|
53
|
+
# brief.
|
54
|
+
|
55
|
+
options = {}
|
56
|
+
|
57
|
+
optparse = OptionParser.new do|opts|
|
58
|
+
# Help banner.
|
59
|
+
opts.banner = "Usage: case_scraper.rb [options] InputFile.rtf OutputFile.txt ..."
|
60
|
+
|
61
|
+
# Define options.
|
62
|
+
options[:fullcite] = false
|
63
|
+
opts.on( '-f', '--full', 'Scrape complete case citations with headings.' ) do
|
64
|
+
options[:fullcite] = true
|
65
|
+
end
|
66
|
+
|
67
|
+
opts.on( '-h', '--help', 'Display help.' ) do
|
68
|
+
puts opts
|
69
|
+
exit
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Parse the command-line and remove options from ARGV array.
|
74
|
+
optparse.parse!
|
40
75
|
|
41
76
|
# Test that input and output files
|
42
77
|
# are specified.
|
43
78
|
|
44
79
|
unless ARGV.length == 2
|
45
80
|
puts "You must include an input filename (RTF format) and an output filename (TXT format)."
|
46
|
-
puts "Usage: ruby
|
81
|
+
puts "Usage: ruby case_scraper.rb InputFile.rtf OutputFile.txt\n"
|
47
82
|
exit
|
48
83
|
end
|
49
84
|
|
85
|
+
if options[:fullcite]
|
86
|
+
|
87
|
+
# Regexp for scraping the entire case citation
|
88
|
+
|
89
|
+
citationregex = "(((([a-z]{3}(\\.|\\?|!)\"?)|see also|see|accord)\\s)|^[A-Z][a-z]*\\s([Vv][\\.]*\\s))[A-Z].{1,125}(\\,|\\s(\\[\\d{4,4}\\])|\\s\\(\\d{4,4}\\)|\\s)\\s\\d{1,4}\\s[A-Za-z0-9\\.\\s]{3,17}\\d{1,8}(\\s?|\\,|\\.)((\\(|\\[)([A-Za-z0-9\\.\\,\\s]{1,20})*\\d{4,4}(\\]|\\))*)*|^\\s*[A-Za-z0-9]{1,4}\\.\\s.{1,150}$"
|
90
|
+
else
|
91
|
+
|
92
|
+
# Regexp for scraping just the volume,
|
93
|
+
# reporter, and first page number.
|
94
|
+
|
95
|
+
citationregex = "\\d{1,4}\\s\[^\\[\\(]{1,16}\\s\\d{1,7}"
|
96
|
+
end
|
97
|
+
|
98
|
+
# Regexp for scraping the brief headings.
|
99
|
+
|
50
100
|
# Set the two command line arguments
|
51
101
|
# as variables.
|
52
102
|
|
@@ -82,22 +132,67 @@ else
|
|
82
132
|
end
|
83
133
|
|
84
134
|
# Using regular expressions, extract
|
85
|
-
# the
|
86
|
-
#
|
87
|
-
# number for (hopefully . . . ) each
|
88
|
-
# full case citation in the brief.
|
89
|
-
#
|
135
|
+
# the citations and headings, per
|
136
|
+
# the specified options.
|
90
137
|
# The regular expressions should
|
91
138
|
# catch common variations, including
|
92
139
|
# spacing or lack of spacing in the
|
93
140
|
# reporter abbreviation,
|
94
141
|
# state-specific citation formats for
|
95
142
|
# New York and California, and
|
96
|
-
#
|
97
|
-
# unpublished rulings.
|
143
|
+
# Westlaw and LEXIS citations for
|
144
|
+
# unpublished rulings. As you may see
|
145
|
+
# if you run the example file, the script
|
146
|
+
# is imperfect and may over-select when
|
147
|
+
# dealing with short sentences or
|
148
|
+
# certain citations at the beginning
|
149
|
+
# of a new line.
|
98
150
|
|
99
151
|
puts "Scraping case citations from #{input}."
|
100
|
-
|
152
|
+
|
153
|
+
# Alternative processing procedures
|
154
|
+
# depending on whether the script is
|
155
|
+
# extracting full or partial citations.
|
156
|
+
# Slightly more processing is required
|
157
|
+
# for full citations because the regexp
|
158
|
+
# has a lot of nested captures and the
|
159
|
+
# resulting array is more complex.
|
160
|
+
|
161
|
+
if options[:fullcite]
|
162
|
+
allcapture_array = text.scan(/(#{citationregex})/)
|
163
|
+
|
164
|
+
fullcase_array = allcapture_array.transpose
|
165
|
+
|
166
|
+
fullcases_only_array = fullcase_array.first
|
167
|
+
|
168
|
+
fullcases_only_array.flatten
|
169
|
+
|
170
|
+
fullcases_only_array.each do |a|
|
171
|
+
a.gsub!(/^[a-z]{3,3}\."?\s/, "")
|
172
|
+
end
|
173
|
+
|
174
|
+
fullcases_only_array.reject! { |c| c.empty? }
|
175
|
+
|
176
|
+
puts "Writing case citations to #{output}."
|
177
|
+
|
178
|
+
File.open(target, "w+") do |f|
|
179
|
+
f.puts(fullcases_only_array)
|
180
|
+
end
|
181
|
+
|
182
|
+
else
|
183
|
+
allcapture_array = text.scan(/(#{citationregex})/)
|
184
|
+
fullcase_array = allcapture_array.transpose
|
185
|
+
|
186
|
+
fullcases_only_array = fullcase_array.first
|
187
|
+
|
188
|
+
fullcases_only_array.reject! { |c| c.empty? }
|
189
|
+
|
190
|
+
|
191
|
+
fullcases_only_array.flatten
|
192
|
+
File.open(target, "w+") do |f|
|
193
|
+
f.puts(fullcases_only_array)
|
194
|
+
end
|
195
|
+
end
|
101
196
|
|
102
197
|
puts "Writing case citations to #{output}."
|
103
198
|
|
@@ -108,12 +203,19 @@ target.close()
|
|
108
203
|
# Open the output file in the user's
|
109
204
|
# default text editor.
|
110
205
|
#
|
111
|
-
#
|
112
|
-
#
|
206
|
+
# If the chosen output is a bare,
|
207
|
+
# partial citation list, then the
|
208
|
+
# output, with one citation per line,
|
209
|
+
# is in the appropriate format for
|
113
210
|
# cutting and pasting directly into
|
114
211
|
# Westlaw "Find & Print." (The
|
115
212
|
# maximum number of lines for a
|
116
213
|
# single query is 99.)
|
214
|
+
#
|
215
|
+
# For full citations with headings,
|
216
|
+
# the output should save you time in
|
217
|
+
# structuring your opposition / reply
|
218
|
+
# brief.
|
117
219
|
|
118
220
|
%x{ call #{output} }
|
119
221
|
|
metadata
CHANGED
@@ -1,31 +1,31 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: case_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Greg Kochansky
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-10-
|
11
|
+
date: 2013-10-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: yomu
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - '>='
|
17
|
+
- - ! '>='
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '0'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - '>='
|
24
|
+
- - ! '>='
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
-
description: Scrape case citations
|
28
|
-
|
27
|
+
description: Scrape case citations and section headings from legal briefs in *.rtf
|
28
|
+
format.
|
29
29
|
email: greg@greg-k.com
|
30
30
|
executables: []
|
31
31
|
extensions: []
|
@@ -42,12 +42,12 @@ require_paths:
|
|
42
42
|
- lib
|
43
43
|
required_ruby_version: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - '>='
|
45
|
+
- - ! '>='
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '0'
|
48
48
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
49
49
|
requirements:
|
50
|
-
- - '>='
|
50
|
+
- - ! '>='
|
51
51
|
- !ruby/object:Gem::Version
|
52
52
|
version: '0'
|
53
53
|
requirements: []
|
@@ -55,6 +55,5 @@ rubyforge_project:
|
|
55
55
|
rubygems_version: 2.0.7
|
56
56
|
signing_key:
|
57
57
|
specification_version: 4
|
58
|
-
summary: Scrape case citations
|
59
|
-
legal briefs that are in rtf format.
|
58
|
+
summary: Scrape case citations and section headings from legal briefs in *.rtf format.
|
60
59
|
test_files: []
|