case_scraper 0.0.4 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +14 -6
  2. data/lib/case_scraper.rb +120 -18
  3. metadata +9 -10
checksums.yaml CHANGED
@@ -1,7 +1,15 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 79b5532947888d98823625fd8d4dce3dad260cab
4
- data.tar.gz: 6d43160afde9bbd261b7f3d1dfca49af3b913496
5
- SHA512:
6
- metadata.gz: 3f0e434c0fd38294e9ee47080483e4df2428e7f640de3c6742798d2649bbbcf73b165e6f52e5b2b8bba0a930dbd8e997594395564e39039f1c2126e82b5cff5b
7
- data.tar.gz: 3625f884841744b71d5f29f3a6b1adfee0b4882c1846116f37953d1ea1fa57dad04e5cd7151ea59d1506f7738270e3b29231883d600e3bedd36538fd6653868c
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ NGQ0ODk4YjUzYTJjNWE0YTU0MGY3ZDAyMjE4ZGY3MTM3MjY1ODE1Ng==
5
+ data.tar.gz: !binary |-
6
+ YTQ4MmMxNDk2MmEwOWE2N2YzOTFhYThlMTQ0NTY0YWRlZTVjYzQ2MQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ ZGFkODBjYTU2NGE0YWU0OTg0OTcxYmVlNWNjNGI4Y2I5ODQwMzI5ZDQ5Yjdj
10
+ MDVjMDkyZWRiZDVhMzAwNWJkYmY1ZjlmZGRhMWZlMjIzYTlmMjk4MWFiNDgz
11
+ OGMyODAyYjM1OTE2ZDk3ZDQ4M2RiMmI2NTllZTNiYjc2MGY2ZWM=
12
+ data.tar.gz: !binary |-
13
+ YTNjYzI1YmVkOWVlMjVmNDMwYTczMDRhODJkZDQzYjUwZTI4N2EwYjBlZDVk
14
+ NTA1NWQ1NzhiZWYzNmJmYmYwMDY0OTMwYzQ1OGVhNzU3OGYwYmMyZWFkZjQ5
15
+ YjQwODVlYjhlZTBhZDJjNjBhOThiN2ZiMzg1MWVjNWQzZTMxN2M=
data/lib/case_scraper.rb CHANGED
@@ -2,13 +2,22 @@
2
2
  #
3
3
  # This script uses the Yomu library
4
4
  # to extract the text from a legal
5
- # brief in RTF format. It then uses
6
- # regular expressions to search the
7
- # document for case citations and
8
- # extract the volume, reporter, and
9
- # starting page for each case. The case
10
- # citations are then output to a text
11
- # file, with one citation per line.
5
+ # brief that is in RTF format. The
6
+ # script then applies regular
7
+ # expressions to search the text of
8
+ # the document for case citations.
9
+
10
+ # You have the option of extracting
11
+ # only the volume, reporter, and starting
12
+ # page number for each case, or extracting
13
+ # the entire citation (though this is
14
+ # imperfect) along with the organizational
15
+ # headings from the brief, with the case
16
+ # citations each placed under the
17
+ # appropriate section or subsection. The
18
+ # case citations and heading text are
19
+ # then output to a text file, with one
20
+ # citation or heading per line.
12
21
  #
13
22
  # This is an experimental script. Use
14
23
  # at your own risk and please be sure
@@ -37,16 +46,57 @@
37
46
  # Require the external Yomu library.
38
47
 
39
48
  require 'yomu'
49
+ require 'optparse'
50
+
51
+ # Define options for toggling citation
52
+ # type and inclusion of headings in the
53
+ # brief.
54
+
55
+ options = {}
56
+
57
+ optparse = OptionParser.new do|opts|
58
+ # Help banner.
59
+ opts.banner = "Usage: case_scraper.rb [options] InputFile.rtf OutputFile.txt ..."
60
+
61
+ # Define options.
62
+ options[:fullcite] = false
63
+ opts.on( '-f', '--full', 'Scrape complete case citations with headings.' ) do
64
+ options[:fullcite] = true
65
+ end
66
+
67
+ opts.on( '-h', '--help', 'Display help.' ) do
68
+ puts opts
69
+ exit
70
+ end
71
+ end
72
+
73
+ # Parse the command-line and remove options from ARGV array.
74
+ optparse.parse!
40
75
 
41
76
  # Test that input and output files
42
77
  # are specified.
43
78
 
44
79
  unless ARGV.length == 2
45
80
  puts "You must include an input filename (RTF format) and an output filename (TXT format)."
46
- puts "Usage: ruby MyScript.rb InputFile.rtf OutputFile.txt\n"
81
+ puts "Usage: ruby case_scraper.rb InputFile.rtf OutputFile.txt\n"
47
82
  exit
48
83
  end
49
84
 
85
+ if options[:fullcite]
86
+
87
+ # Regexp for scraping the entire case citation
88
+
89
+ citationregex = "(((([a-z]{3}(\\.|\\?|!)\"?)|see also|see|accord)\\s)|^[A-Z][a-z]*\\s([Vv][\\.]*\\s))[A-Z].{1,125}(\\,|\\s(\\[\\d{4,4}\\])|\\s\\(\\d{4,4}\\)|\\s)\\s\\d{1,4}\\s[A-Za-z0-9\\.\\s]{3,17}\\d{1,8}(\\s?|\\,|\\.)((\\(|\\[)([A-Za-z0-9\\.\\,\\s]{1,20})*\\d{4,4}(\\]|\\))*)*|^\\s*[A-Za-z0-9]{1,4}\\.\\s.{1,150}$"
90
+ else
91
+
92
+ # Regexp for scraping just the volume,
93
+ # reporter, and first page number.
94
+
95
+ citationregex = "\\d{1,4}\\s\[^\\[\\(]{1,16}\\s\\d{1,7}"
96
+ end
97
+
98
+ # Regexp for scraping the brief headings.
99
+
50
100
  # Set the two command line arguments
51
101
  # as variables.
52
102
 
@@ -82,22 +132,67 @@ else
82
132
  end
83
133
 
84
134
  # Using regular expressions, extract
85
- # the case volume, the reporter
86
- # abbreviation, and the starting page
87
- # number for (hopefully . . . ) each
88
- # full case citation in the brief.
89
- #
135
+ # the citations and headings, per
136
+ # the specified options.
90
137
  # The regular expressions should
91
138
  # catch common variations, including
92
139
  # spacing or lack of spacing in the
93
140
  # reporter abbreviation,
94
141
  # state-specific citation formats for
95
142
  # New York and California, and
96
- # Westlaw and LEXIS citations for
97
- # unpublished rulings.
143
+ # WestLaw and LEXIS citations for
144
+ # unpublished rulings. As you may see
145
+ # if you run the example file, the script
146
+ # is imperfect and may over-select when
147
+ # dealing with short sentences or
148
+ # certain citations at the beginning
149
+ # of a new line.
98
150
 
99
151
  puts "Scraping case citations from #{input}."
100
- text.scan(/\d+\s\i*\w+\.+\s*\w*\.*\s*\w*\.*\s*\w*\.*\s\d+/) { |w| target.write "#{w}\n" }
152
+
153
+ # Alternative processing procedures
154
+ # depending on whether the script is
155
+ # extracting full or partial citations.
156
+ # Slightly more processing is required
157
+ # for full citations because the regexp
158
+ # has a lot of nested captures and the
159
+ # resulting array is more complex.
160
+
161
+ if options[:fullcite]
162
+ allcapture_array = text.scan(/(#{citationregex})/)
163
+
164
+ fullcase_array = allcapture_array.transpose
165
+
166
+ fullcases_only_array = fullcase_array.first
167
+
168
+ fullcases_only_array.flatten
169
+
170
+ fullcases_only_array.each do |a|
171
+ a.gsub!(/^[a-z]{3,3}\."?\s/, "")
172
+ end
173
+
174
+ fullcases_only_array.reject! { |c| c.empty? }
175
+
176
+ puts "Writing case citations to #{output}."
177
+
178
+ File.open(target, "w+") do |f|
179
+ f.puts(fullcases_only_array)
180
+ end
181
+
182
+ else
183
+ allcapture_array = text.scan(/(#{citationregex})/)
184
+ fullcase_array = allcapture_array.transpose
185
+
186
+ fullcases_only_array = fullcase_array.first
187
+
188
+ fullcases_only_array.reject! { |c| c.empty? }
189
+
190
+
191
+ fullcases_only_array.flatten
192
+ File.open(target, "w+") do |f|
193
+ f.puts(fullcases_only_array)
194
+ end
195
+ end
101
196
 
102
197
  puts "Writing case citations to #{output}."
103
198
 
@@ -108,12 +203,19 @@ target.close()
108
203
  # Open the output file in the user's
109
204
  # default text editor.
110
205
  #
111
- # The citation list, one per line, is
112
- # in the appropriate format for
206
+ # If the chosen output is a bare,
207
+ # partial citation list, then the
208
+ # output, with one citation per line,
209
+ # is in the appropriate format for
113
210
  # cutting and pasting directly into
114
211
  # Westlaw "Find & Print." (The
115
212
  # maximum number of lines for a
116
213
  # single query is 99.)
214
+ #
215
+ # For full citations with headings,
216
+ # the output should save you time in
217
+ # structuring your opposition / reply
218
+ # brief.
117
219
 
118
220
  %x{ call #{output} }
119
221
 
metadata CHANGED
@@ -1,31 +1,31 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: case_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Greg Kochansky
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-10-03 00:00:00.000000000 Z
11
+ date: 2013-10-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: yomu
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - '>='
17
+ - - ! '>='
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - '>='
24
+ - - ! '>='
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
- description: Scrape case citations - volume, reporter, and starting page number -
28
- from legal briefs that are in rtf format.
27
+ description: Scrape case citations and section headings from legal briefs in *.rtf
28
+ format.
29
29
  email: greg@greg-k.com
30
30
  executables: []
31
31
  extensions: []
@@ -42,12 +42,12 @@ require_paths:
42
42
  - lib
43
43
  required_ruby_version: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - '>='
45
+ - - ! '>='
46
46
  - !ruby/object:Gem::Version
47
47
  version: '0'
48
48
  required_rubygems_version: !ruby/object:Gem::Requirement
49
49
  requirements:
50
- - - '>='
50
+ - - ! '>='
51
51
  - !ruby/object:Gem::Version
52
52
  version: '0'
53
53
  requirements: []
@@ -55,6 +55,5 @@ rubyforge_project:
55
55
  rubygems_version: 2.0.7
56
56
  signing_key:
57
57
  specification_version: 4
58
- summary: Scrape case citations - volume, reporter, and starting page number - from
59
- legal briefs that are in rtf format.
58
+ summary: Scrape case citations and section headings from legal briefs in *.rtf format.
60
59
  test_files: []