parsefile 0.0.9 → 0.0.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 99b0e344729338584b0af0696cb2e6e6ef18edef
4
- data.tar.gz: 79b40b78a7af582e57492a4daa75fab5db0462b7
3
+ metadata.gz: bf06a478153c7107295b160bc8b2d76e403e7d8f
4
+ data.tar.gz: 4122ff2b451ba656d75893119209a1adefd2d5dc
5
5
  SHA512:
6
- metadata.gz: 2f2683b2aa5ba9b328f3d8b1bbd9b75bcb5cf9f20aa4c434ee64db7b9ac798e842b998e315532b1253f7b955a2d952abcb9f67fcd9311bc4bf76a473a582a765
7
- data.tar.gz: 2dc0d20a1c0eceb636ff89c15c2311a235a23814cae04bf11e41e2b305d0ca7508ce68180d7a441617cb273f0d9158881b8e28e235422ad48ef3886606a72623
6
+ metadata.gz: 0d92b7753ae69345a3a456be7f61982969bde31dfad4dde386fd4906a8ba631610a6a2dadf6ae7b3c841c01f12e6b7627d96528745aaeed11abf9166a7b321b8
7
+ data.tar.gz: 57620b37897bb7b99d523c9deb76948a723f305338a958a646c6e69b35a38ecaa4f3cb848803bbe8652c45f1b83579cdb5c981d9a132fbd88454eda60eaea066
@@ -6,6 +6,10 @@ class ExtractMetadata
6
6
  @path = file
7
7
  @input_dir = input_dir
8
8
  @output_dir = output_dir
9
+ @allowed_extensions = [
10
+ 'pdf', 'doc', 'docbook', 'docx', 'txt', 'rtf', 'md', 'csv', 'xls', 'xlsx',
11
+ 'jpg', 'jpeg', 'png', 'gif', 'svg'
12
+ ]
9
13
  end
10
14
 
11
15
  # Extract metadata
@@ -23,7 +27,11 @@ class ExtractMetadata
23
27
 
24
28
  # Extract file metadata, merge, and return
25
29
  begin
26
- outhash.merge!(extract_file_metadata)
30
+ if (@allowed_extensions.include? outhash[:filetype])
31
+ outhash.merge!(extract_file_metadata)
32
+ else
33
+ puts "skipping ." + outhash[:filetype] + " file"
34
+ end
27
35
  rescue
28
36
  end
29
37
  return outhash
@@ -1,5 +1,4 @@
1
1
  require 'fileutils'
2
- require 'docsplit'
3
2
  require 'curb'
4
3
  require 'mimemagic'
5
4
 
@@ -16,42 +15,32 @@ class OCRFile
16
15
  # OCR file
17
16
  def ocr
18
17
  begin
18
+ mime_magic = MimeMagic.by_path(@path)
19
19
  if File.exist?(@output_dir+@rel_path+".json")
20
20
  load_extracted_text(@output_dir+@rel_path+".json")
21
- #elsif @path.include?(".pdf")
22
- # ocr_pdf
23
21
  else
24
22
  if @tika
25
- give_me_text_local
23
+ give_me_text_local(mime_magic)
26
24
  else
27
- @text = File.read(@path)
28
- # give_me_text
25
+ give_me_text
29
26
  end
30
27
  end
31
- rescue # Detect errors
32
- #binding.pry
28
+ rescue
29
+ # Detect errors
30
+ binding.pry
33
31
  end
34
32
 
35
33
  return @text
36
34
  end
37
35
 
38
- # Check if file is pdf
39
- def is_pdf?
40
- puts "determined: is_pdf"
41
- file_start = File.open(@path, 'r') { |f| f.read(8)}
42
- file_start.match(/\%PDF-\d+\.?\d+/)
43
- end
44
-
45
36
  # Load text that is already extracted
46
37
  def load_extracted_text(file)
47
- puts "file exists: load_extracted_text"
38
+ puts "file already exists"
48
39
  @text = JSON.parse(File.read(file))["text"]
49
40
  end
50
41
 
51
42
  # Send file to give me text
52
43
  def give_me_text
53
- puts "using: give_me_text"
54
-
55
44
  c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
56
45
  c.multipart_form_post = true
57
46
  c.http_post(Curl::PostField.file('file', @path))
@@ -60,17 +49,13 @@ class OCRFile
60
49
  gotten_text_ok?(@text)
61
50
  end
62
51
 
63
- def give_me_text_local
64
- puts "using: give_me_text_local"
52
+ def give_me_text_local(mime_magic)
65
53
  c = Curl::Easy.new(@tika + "/tika")
66
- # TODO: move this mime filtering to a higher global level
67
- mime_magic = MimeMagic.by_path(@path)
68
54
  file_data = File.read(@path)
69
55
  c.headers['Content-Type'] = mime_magic.type
70
56
  c.headers['Accept'] = "text/plain"
71
57
  c.http_put(file_data)
72
58
 
73
- #binding.pry
74
59
  @text = c.body_str
75
60
  gotten_text_ok?(@text)
76
61
  end
@@ -80,38 +65,4 @@ class OCRFile
80
65
  throw :extraction_error if text.include?("java.io.IOException: Stream Closed")
81
66
  end
82
67
 
83
- # OCR with tesseract
84
- def ocr_pdf
85
- puts "using: ocr_pdf"
86
- # Dir_paths
87
- base = Dir.pwd+"/"
88
-
89
- # Split pages to handle large PDFs
90
- Docsplit.extract_pages(@path, :output => base+'pages')
91
- filename = @path.split("/").last.gsub(".pdf", "")
92
- docs = Dir[base+'pages/'+filename+'*']
93
-
94
- # Rename pages so that they can be processed with spaces
95
- docs.each do |d|
96
- new_name = d.split("/").last.gsub(" ", "_").gsub("(", "").gsub(")", "")
97
- File.rename(d, base+'pages/'+new_name)
98
- end
99
- filename = filename.gsub(" ", "_").gsub("(", "").gsub(")", "")
100
- docs_no_spaces = Dir[base+'pages/'+filename+'*']
101
-
102
- # Extract text and save
103
- Docsplit.extract_text(docs_no_spaces, :output => base+'text')
104
- text_files = Dir[base+'text/'+filename+'*']
105
- sorted_text = text_files.sort_by {|f|
106
- f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i }
107
- sorted_text.each do |f|
108
- @text += File.read(f)
109
- end
110
-
111
- # Clean up
112
- FileUtils.rm_f Dir.glob(base+"pages/*")
113
- Dir.delete(base+"pages")
114
- FileUtils.rm_f Dir.glob(base+"text/*")
115
- Dir.delete(base+"text")
116
- end
117
68
  end
@@ -1,44 +1,40 @@
1
1
  require 'json'
2
- require 'docsplit'
3
2
  require 'fileutils'
4
3
  require 'pry'
5
-
6
- load 'ocrfile.rb'
7
- load 'extractmetadata.rb'
4
+ require 'ocrfile'
5
+ require 'extractmetadata'
8
6
 
9
7
  class ParseFile
10
8
  def initialize(file, input_dir, output_dir, tika)
11
9
  @path = file
12
10
  @input_dir = input_dir
13
11
  @output_dir = output_dir
14
- # Pass the url for a custom (or local) Tika server
15
- # Else use OKFNs service over normal HTTP... ZOMG... O.o
12
+ # Pass URL of a Tika server
16
13
  if tika
17
14
  @tika = tika
15
+ # Use OKFN's service over normal HTTP... ZOMG... O.o
18
16
  else
19
17
  @tika = nil
20
18
  end
21
19
  end
22
20
 
23
- # Parse the file
24
21
  def parse_file
25
22
  begin
26
- # Get metadata
27
- m = ExtractMetadata.new(@path, @input_dir, @output_dir)
28
- @metadata = m.extract
23
+ puts "sending file: " + @path
24
+
25
+ m = ExtractMetadata.new(@path, @input_dir, @output_dir)
26
+ @metadata = m.extract
29
27
 
30
- # OCR File
31
- o = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path], @tika)
32
- @text = o.ocr
28
+ o = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path], @tika)
29
+ @text = o.ocr
33
30
 
34
- # Generate output and return
35
- gen_output
36
- rescue #TODO: Fix!
31
+ gen_output
32
+ rescue
33
+ #TODO: use a global debug / log
37
34
  binding.pry
38
35
  end
39
36
  end
40
37
 
41
- # Generate output
42
38
  def gen_output
43
39
  outhash = Hash.new
44
40
  outhash[:full_path] = @path
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parsefile
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-03-07 00:00:00.000000000 Z
12
+ date: 2017-05-29 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: OCR file and extract metadata using Apache Tika and Tesseract
15
15
  email: shidash@shidash.com
@@ -40,9 +40,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
40
40
  version: '0'
41
41
  requirements: []
42
42
  rubyforge_project:
43
- rubygems_version: 2.4.8
43
+ rubygems_version: 2.6.11
44
44
  signing_key:
45
45
  specification_version: 4
46
46
  summary: OCR file and extract metadata
47
47
  test_files: []
48
- has_rdoc: