case_scraper 0.0.4 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +14 -6
  2. data/lib/case_scraper.rb +120 -18
  3. metadata +9 -10
checksums.yaml CHANGED
@@ -1,7 +1,15 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 79b5532947888d98823625fd8d4dce3dad260cab
4
- data.tar.gz: 6d43160afde9bbd261b7f3d1dfca49af3b913496
5
- SHA512:
6
- metadata.gz: 3f0e434c0fd38294e9ee47080483e4df2428e7f640de3c6742798d2649bbbcf73b165e6f52e5b2b8bba0a930dbd8e997594395564e39039f1c2126e82b5cff5b
7
- data.tar.gz: 3625f884841744b71d5f29f3a6b1adfee0b4882c1846116f37953d1ea1fa57dad04e5cd7151ea59d1506f7738270e3b29231883d600e3bedd36538fd6653868c
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ NGQ0ODk4YjUzYTJjNWE0YTU0MGY3ZDAyMjE4ZGY3MTM3MjY1ODE1Ng==
5
+ data.tar.gz: !binary |-
6
+ YTQ4MmMxNDk2MmEwOWE2N2YzOTFhYThlMTQ0NTY0YWRlZTVjYzQ2MQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ ZGFkODBjYTU2NGE0YWU0OTg0OTcxYmVlNWNjNGI4Y2I5ODQwMzI5ZDQ5Yjdj
10
+ MDVjMDkyZWRiZDVhMzAwNWJkYmY1ZjlmZGRhMWZlMjIzYTlmMjk4MWFiNDgz
11
+ OGMyODAyYjM1OTE2ZDk3ZDQ4M2RiMmI2NTllZTNiYjc2MGY2ZWM=
12
+ data.tar.gz: !binary |-
13
+ YTNjYzI1YmVkOWVlMjVmNDMwYTczMDRhODJkZDQzYjUwZTI4N2EwYjBlZDVk
14
+ NTA1NWQ1NzhiZWYzNmJmYmYwMDY0OTMwYzQ1OGVhNzU3OGYwYmMyZWFkZjQ5
15
+ YjQwODVlYjhlZTBhZDJjNjBhOThiN2ZiMzg1MWVjNWQzZTMxN2M=
data/lib/case_scraper.rb CHANGED
@@ -2,13 +2,22 @@
2
2
  #
3
3
  # This script uses the Yomu library
4
4
  # to extract the text from a legal
5
- # brief in RTF format. It then uses
6
- # regular expressions to search the
7
- # document for case citations and
8
- # extract the volume, reporter, and
9
- # starting page for each case. The case
10
- # citations are then output to a text
11
- # file, with one citation per line.
5
+ # brief that is in RTF format. The
6
+ # script then applies regular
7
+ # expressions to search the text of
8
+ # the document for case citations.
9
+
10
+ # You have the option of extracting
11
+ # only the volume, reporter, and starting
12
+ # page number for each case, or extracting
13
+ # the entire citation (though this is
14
+ # imperfect) along with the organizational
15
+ # headings from the brief, with the case
16
+ # citations each placed under the
17
+ # appropriate section or subsection. The
18
+ # case citations and heading text are
19
+ # then output to a text file, with one
20
+ # citation or heading per line.
12
21
  #
13
22
  # This is an experimental script. Use
14
23
  # at your own risk and please be sure
@@ -37,16 +46,57 @@
37
46
  # Require the external Yomu library.
38
47
 
39
48
  require 'yomu'
49
+ require 'optparse'
50
+
51
+ # Define options for toggling citation
52
+ # type and inclusion of headings in the
53
+ # brief.
54
+
55
+ options = {}
56
+
57
+ optparse = OptionParser.new do|opts|
58
+ # Help banner.
59
+ opts.banner = "Usage: case_scraper.rb [options] InputFile.rtf OutputFile.txt ..."
60
+
61
+ # Define options.
62
+ options[:fullcite] = false
63
+ opts.on( '-f', '--full', 'Scrape complete case citations with headings.' ) do
64
+ options[:fullcite] = true
65
+ end
66
+
67
+ opts.on( '-h', '--help', 'Display help.' ) do
68
+ puts opts
69
+ exit
70
+ end
71
+ end
72
+
73
+ # Parse the command line and remove options from the ARGV array.
74
+ optparse.parse!
40
75
 
41
76
  # Test that input and output files
42
77
  # are specified.
43
78
 
44
79
  unless ARGV.length == 2
45
80
  puts "You must include an input filename (RTF format) and an output filename (TXT format)."
46
- puts "Usage: ruby MyScript.rb InputFile.rtf OutputFile.txt\n"
81
+ puts "Usage: ruby case_scraper.rb InputFile.rtf OutputFile.txt\n"
47
82
  exit
48
83
  end
49
84
 
85
+ if options[:fullcite]
86
+
87
+ # Regexp for scraping the entire case citation
88
+
89
+ citationregex = "(((([a-z]{3}(\\.|\\?|!)\"?)|see also|see|accord)\\s)|^[A-Z][a-z]*\\s([Vv][\\.]*\\s))[A-Z].{1,125}(\\,|\\s(\\[\\d{4,4}\\])|\\s\\(\\d{4,4}\\)|\\s)\\s\\d{1,4}\\s[A-Za-z0-9\\.\\s]{3,17}\\d{1,8}(\\s?|\\,|\\.)((\\(|\\[)([A-Za-z0-9\\.\\,\\s]{1,20})*\\d{4,4}(\\]|\\))*)*|^\\s*[A-Za-z0-9]{1,4}\\.\\s.{1,150}$"
90
+ else
91
+
92
+ # Regexp for scraping just the volume,
93
+ # reporter, and first page number.
94
+
95
+ citationregex = "\\d{1,4}\\s\[^\\[\\(]{1,16}\\s\\d{1,7}"
96
+ end
97
+
98
+ # Regexp for scraping the brief headings.
99
+
50
100
  # Set the two command line arguments
51
101
  # as variables.
52
102
 
@@ -82,22 +132,67 @@ else
82
132
  end
83
133
 
84
134
  # Using regular expressions, extract
85
- # the case volume, the reporter
86
- # abbreviation, and the starting page
87
- # number for (hopefully . . . ) each
88
- # full case citation in the brief.
89
- #
135
+ # the citations and headings, per
136
+ # the specified options.
90
137
  # The regular expressions should
91
138
  # catch common variations, including
92
139
  # spacing or lack of spacing in the
93
140
  # reporter abbreviation,
94
141
  # state-specific citation formats for
95
142
  # New York and California, and
96
- # Westlaw and LEXIS citations for
97
- # unpublished rulings.
143
+ # Westlaw and LEXIS citations for
144
+ # unpublished rulings. As you may see
145
+ # if you run the example file, the script
146
+ # is imperfect and may over-select when
147
+ # dealing with short sentences or
148
+ # certain citations at the beginning
149
+ # of a new line.
98
150
 
99
151
  puts "Scraping case citations from #{input}."
100
- text.scan(/\d+\s\i*\w+\.+\s*\w*\.*\s*\w*\.*\s*\w*\.*\s\d+/) { |w| target.write "#{w}\n" }
152
+
153
+ # Alternative processing procedures
154
+ # depending on whether the script is
155
+ # extracting full or partial citations.
156
+ # Slightly more processing is required
157
+ # for full citations because the regexp
158
+ # has a lot of nested captures and the
159
+ # resulting array is more complex.
160
+
161
+ if options[:fullcite]
162
+ allcapture_array = text.scan(/(#{citationregex})/)
163
+
164
+ fullcase_array = allcapture_array.transpose
165
+
166
+ fullcases_only_array = fullcase_array.first
167
+
168
+ fullcases_only_array.flatten
169
+
170
+ fullcases_only_array.each do |a|
171
+ a.gsub!(/^[a-z]{3,3}\."?\s/, "")
172
+ end
173
+
174
+ fullcases_only_array.reject! { |c| c.empty? }
175
+
176
+ puts "Writing case citations to #{output}."
177
+
178
+ File.open(target, "w+") do |f|
179
+ f.puts(fullcases_only_array)
180
+ end
181
+
182
+ else
183
+ allcapture_array = text.scan(/(#{citationregex})/)
184
+ fullcase_array = allcapture_array.transpose
185
+
186
+ fullcases_only_array = fullcase_array.first
187
+
188
+ fullcases_only_array.reject! { |c| c.empty? }
189
+
190
+
191
+ fullcases_only_array.flatten
192
+ File.open(target, "w+") do |f|
193
+ f.puts(fullcases_only_array)
194
+ end
195
+ end
101
196
 
102
197
  puts "Writing case citations to #{output}."
103
198
 
@@ -108,12 +203,19 @@ target.close()
108
203
  # Open the output file in the user's
109
204
  # default text editor.
110
205
  #
111
- # The citation list, one per line, is
112
- # in the appropriate format for
206
+ # If the chosen output is a bare,
207
+ # partial citation list, then the
208
+ # output, with one citation per line,
209
+ # is in the appropriate format for
113
210
  # cutting and pasting directly into
114
211
  # Westlaw "Find & Print." (The
115
212
  # maximum number of lines for a
116
213
  # single query is 99.)
214
+ #
215
+ # For full citations with headings,
216
+ # the output should save you time in
217
+ # structuring your opposition / reply
218
+ # brief.
117
219
 
118
220
  %x{ call #{output} }
119
221
 
metadata CHANGED
@@ -1,31 +1,31 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: case_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Greg Kochansky
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-10-03 00:00:00.000000000 Z
11
+ date: 2013-10-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: yomu
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - '>='
17
+ - - ! '>='
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - '>='
24
+ - - ! '>='
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
- description: Scrape case citations - volume, reporter, and starting page number -
28
- from legal briefs that are in rtf format.
27
+ description: Scrape case citations and section headings from legal briefs in *.rtf
28
+ format.
29
29
  email: greg@greg-k.com
30
30
  executables: []
31
31
  extensions: []
@@ -42,12 +42,12 @@ require_paths:
42
42
  - lib
43
43
  required_ruby_version: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - '>='
45
+ - - ! '>='
46
46
  - !ruby/object:Gem::Version
47
47
  version: '0'
48
48
  required_rubygems_version: !ruby/object:Gem::Requirement
49
49
  requirements:
50
- - - '>='
50
+ - - ! '>='
51
51
  - !ruby/object:Gem::Version
52
52
  version: '0'
53
53
  requirements: []
@@ -55,6 +55,5 @@ rubyforge_project:
55
55
  rubygems_version: 2.0.7
56
56
  signing_key:
57
57
  specification_version: 4
58
- summary: Scrape case citations - volume, reporter, and starting page number - from
59
- legal briefs that are in rtf format.
58
+ summary: Scrape case citations and section headings from legal briefs in *.rtf format.
60
59
  test_files: []