parsefile 0.0.9 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 99b0e344729338584b0af0696cb2e6e6ef18edef
4
- data.tar.gz: 79b40b78a7af582e57492a4daa75fab5db0462b7
3
+ metadata.gz: bf06a478153c7107295b160bc8b2d76e403e7d8f
4
+ data.tar.gz: 4122ff2b451ba656d75893119209a1adefd2d5dc
5
5
  SHA512:
6
- metadata.gz: 2f2683b2aa5ba9b328f3d8b1bbd9b75bcb5cf9f20aa4c434ee64db7b9ac798e842b998e315532b1253f7b955a2d952abcb9f67fcd9311bc4bf76a473a582a765
7
- data.tar.gz: 2dc0d20a1c0eceb636ff89c15c2311a235a23814cae04bf11e41e2b305d0ca7508ce68180d7a441617cb273f0d9158881b8e28e235422ad48ef3886606a72623
6
+ metadata.gz: 0d92b7753ae69345a3a456be7f61982969bde31dfad4dde386fd4906a8ba631610a6a2dadf6ae7b3c841c01f12e6b7627d96528745aaeed11abf9166a7b321b8
7
+ data.tar.gz: 57620b37897bb7b99d523c9deb76948a723f305338a958a646c6e69b35a38ecaa4f3cb848803bbe8652c45f1b83579cdb5c981d9a132fbd88454eda60eaea066
@@ -6,6 +6,10 @@ class ExtractMetadata
6
6
  @path = file
7
7
  @input_dir = input_dir
8
8
  @output_dir = output_dir
9
+ @allowed_extensions = [
10
+ 'pdf', 'doc', 'docbook', 'docx', 'txt', 'rtf', 'md', 'csv', 'xls', 'xlsx',
11
+ 'jpg', 'jpeg', 'png', 'gif', 'svg'
12
+ ]
9
13
  end
10
14
 
11
15
  # Extract metadata
@@ -23,7 +27,11 @@ class ExtractMetadata
23
27
 
24
28
  # Extract file metadata, merge, and return
25
29
  begin
26
- outhash.merge!(extract_file_metadata)
30
+ if (@allowed_extensions.include? outhash[:filetype])
31
+ outhash.merge!(extract_file_metadata)
32
+ else
33
+ puts "skipping ." + outhash[:filetype] + " file"
34
+ end
27
35
  rescue
28
36
  end
29
37
  return outhash
@@ -1,5 +1,4 @@
1
1
  require 'fileutils'
2
- require 'docsplit'
3
2
  require 'curb'
4
3
  require 'mimemagic'
5
4
 
@@ -16,42 +15,32 @@ class OCRFile
16
15
  # OCR file
17
16
  def ocr
18
17
  begin
18
+ mime_magic = MimeMagic.by_path(@path)
19
19
  if File.exist?(@output_dir+@rel_path+".json")
20
20
  load_extracted_text(@output_dir+@rel_path+".json")
21
- #elsif @path.include?(".pdf")
22
- # ocr_pdf
23
21
  else
24
22
  if @tika
25
- give_me_text_local
23
+ give_me_text_local(mime_magic)
26
24
  else
27
- @text = File.read(@path)
28
- # give_me_text
25
+ give_me_text
29
26
  end
30
27
  end
31
- rescue # Detect errors
32
- #binding.pry
28
+ rescue
29
+ # Detect errors
30
+ binding.pry
33
31
  end
34
32
 
35
33
  return @text
36
34
  end
37
35
 
38
- # Check if file is pdf
39
- def is_pdf?
40
- puts "determined: is_pdf"
41
- file_start = File.open(@path, 'r') { |f| f.read(8)}
42
- file_start.match(/\%PDF-\d+\.?\d+/)
43
- end
44
-
45
36
  # Load text that is already extracted
46
37
  def load_extracted_text(file)
47
- puts "file exists: load_extracted_text"
38
+ puts "file already exists"
48
39
  @text = JSON.parse(File.read(file))["text"]
49
40
  end
50
41
 
51
42
  # Send file to give me text
52
43
  def give_me_text
53
- puts "using: give_me_text"
54
-
55
44
  c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
56
45
  c.multipart_form_post = true
57
46
  c.http_post(Curl::PostField.file('file', @path))
@@ -60,17 +49,13 @@ class OCRFile
60
49
  gotten_text_ok?(@text)
61
50
  end
62
51
 
63
- def give_me_text_local
64
- puts "using: give_me_text_local"
52
+ def give_me_text_local(mime_magic)
65
53
  c = Curl::Easy.new(@tika + "/tika")
66
- # TODO: move this mime filtering to a higher global level
67
- mime_magic = MimeMagic.by_path(@path)
68
54
  file_data = File.read(@path)
69
55
  c.headers['Content-Type'] = mime_magic.type
70
56
  c.headers['Accept'] = "text/plain"
71
57
  c.http_put(file_data)
72
58
 
73
- #binding.pry
74
59
  @text = c.body_str
75
60
  gotten_text_ok?(@text)
76
61
  end
@@ -80,38 +65,4 @@ class OCRFile
80
65
  throw :extraction_error if text.include?("java.io.IOException: Stream Closed")
81
66
  end
82
67
 
83
- # OCR with tesseract
84
- def ocr_pdf
85
- puts "using: ocr_pdf"
86
- # Dir_paths
87
- base = Dir.pwd+"/"
88
-
89
- # Split pages to handle large PDFs
90
- Docsplit.extract_pages(@path, :output => base+'pages')
91
- filename = @path.split("/").last.gsub(".pdf", "")
92
- docs = Dir[base+'pages/'+filename+'*']
93
-
94
- # Rename pages so that they can be processed with spaces
95
- docs.each do |d|
96
- new_name = d.split("/").last.gsub(" ", "_").gsub("(", "").gsub(")", "")
97
- File.rename(d, base+'pages/'+new_name)
98
- end
99
- filename = filename.gsub(" ", "_").gsub("(", "").gsub(")", "")
100
- docs_no_spaces = Dir[base+'pages/'+filename+'*']
101
-
102
- # Extract text and save
103
- Docsplit.extract_text(docs_no_spaces, :output => base+'text')
104
- text_files = Dir[base+'text/'+filename+'*']
105
- sorted_text = text_files.sort_by {|f|
106
- f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i }
107
- sorted_text.each do |f|
108
- @text += File.read(f)
109
- end
110
-
111
- # Clean up
112
- FileUtils.rm_f Dir.glob(base+"pages/*")
113
- Dir.delete(base+"pages")
114
- FileUtils.rm_f Dir.glob(base+"text/*")
115
- Dir.delete(base+"text")
116
- end
117
68
  end
@@ -1,44 +1,40 @@
1
1
  require 'json'
2
- require 'docsplit'
3
2
  require 'fileutils'
4
3
  require 'pry'
5
-
6
- load 'ocrfile.rb'
7
- load 'extractmetadata.rb'
4
+ require 'ocrfile'
5
+ require 'extractmetadata'
8
6
 
9
7
  class ParseFile
10
8
  def initialize(file, input_dir, output_dir, tika)
11
9
  @path = file
12
10
  @input_dir = input_dir
13
11
  @output_dir = output_dir
14
- # Pass the url for a custom (or local) Tika server
15
- # Else use OKFNs service over normal HTTP... ZOMG... O.o
12
+ # Pass URL of a Tika server
16
13
  if tika
17
14
  @tika = tika
15
+ # Use OKFNs service over normal HTTP... ZOMG... O.o
18
16
  else
19
17
  @tika = nil
20
18
  end
21
19
  end
22
20
 
23
- # Parse the file
24
21
  def parse_file
25
22
  begin
26
- # Get metadata
27
- m = ExtractMetadata.new(@path, @input_dir, @output_dir)
28
- @metadata = m.extract
23
+ puts "sending file: " + @path
24
+
25
+ m = ExtractMetadata.new(@path, @input_dir, @output_dir)
26
+ @metadata = m.extract
29
27
 
30
- # OCR File
31
- o = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path], @tika)
32
- @text = o.ocr
28
+ o = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path], @tika)
29
+ @text = o.ocr
33
30
 
34
- # Generate output and return
35
- gen_output
36
- rescue #TODO: Fix!
31
+ gen_output
32
+ rescue
33
+ #TODO: use a global debug / log
37
34
  binding.pry
38
35
  end
39
36
  end
40
37
 
41
- # Generate output
42
38
  def gen_output
43
39
  outhash = Hash.new
44
40
  outhash[:full_path] = @path
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parsefile
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-03-07 00:00:00.000000000 Z
12
+ date: 2017-05-29 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: OCR file and extract metadata using Apache Tika and Tesseract
15
15
  email: shidash@shidash.com
@@ -40,9 +40,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
40
40
  version: '0'
41
41
  requirements: []
42
42
  rubyforge_project:
43
- rubygems_version: 2.4.8
43
+ rubygems_version: 2.6.11
44
44
  signing_key:
45
45
  specification_version: 4
46
46
  summary: OCR file and extract metadata
47
47
  test_files: []
48
- has_rdoc: