parsefile 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 06411fba942946a2f8946a545d75018c3638fe3c
4
- data.tar.gz: 47108b8867072ecffe7d383a991712b95b47d651
3
+ metadata.gz: 8e7561f81277ae170560990c0efd6f1fac6df28d
4
+ data.tar.gz: 0efdf3f4c2c98114be285ac6c7c60d9d728aa0e3
5
5
  SHA512:
6
- metadata.gz: ed1ab41eef012a454b2b6f694b2829c1360627521e1f9964cf3a3d5588e9cf10a2f406627cc0592017b8bf09ff3967e1b92c7cae2b08ea582f7b4321834efce2
7
- data.tar.gz: 469780b671827136b57b43e36870c3584e09f18dcfd332e638f42eec5b7003e8361cdd1ab5cedfec09c122bf91110a294f019713c46987c121a22b397772ac4e
6
+ metadata.gz: 094423025dd474a3aaf7ce4a7c7ee3f1ba7b39946eaee75f25b4e65fe9ea3117fc8fab7a1fd9afd3680a4a722b25eee9bfe6ed92be61be815a1e3f78f39eb6d2
7
+ data.tar.gz: 5f41812876112bfaa81df0fcf2d90842591abe821429af585d8d90702fe0b8822c4885ae6852c979226884ecc5a7fc1832ab072a36f58c8deaf03226ae6205d7
@@ -22,7 +22,10 @@ class ExtractMetadata
22
22
  outhash[:filetype] = get_file_type
23
23
 
24
24
  # Extract file metadata, merge, and return
25
- outhash.merge!(extract_file_metadata)
25
+ begin
26
+ outhash.merge!(extract_file_metadata)
27
+ rescue
28
+ end
26
29
  return outhash
27
30
  end
28
31
 
data/lib/ocrfile.rb CHANGED
@@ -14,8 +14,8 @@ class OCRFile
14
14
  # OCR file
15
15
  def ocr
16
16
  begin
17
- if File.exist?(@output_dir+@rel_path)
18
- load_extracted_text(@output_dir+@rel_path)
17
+ if File.exist?(@output_dir+@rel_path+".json")
18
+ load_extracted_text(@output_dir+@rel_path+".json")
19
19
  elsif @path.include?(".pdf")
20
20
  ocr_pdf
21
21
  else
@@ -24,7 +24,7 @@ class OCRFile
24
24
  rescue # Detect errors
25
25
  binding.pry
26
26
  end
27
-
27
+
28
28
  return @text
29
29
  end
30
30
 
@@ -36,7 +36,7 @@ class OCRFile
36
36
 
37
37
  # Load text that is already extracted
38
38
  def load_extracted_text(file)
39
- @text = File.read(file)
39
+ @text = JSON.parse(File.read(file))["text"]
40
40
  end
41
41
 
42
42
  # Send file to give me text
@@ -55,23 +55,34 @@ class OCRFile
55
55
 
56
56
  # OCR with tesseract
57
57
  def ocr_pdf
58
+ # Dir_paths
59
+ base = Dir.pwd+"/"
60
+
58
61
  # Split pages to handle large PDFs
59
- Docsplit.extract_pages(@path, :output => 'pages')
62
+ Docsplit.extract_pages(@path, :output => base+'pages')
60
63
  filename = @path.split("/").last.gsub(".pdf", "")
61
- docs = Dir['pages/'+filename+'*']
64
+ docs = Dir[base+'pages/'+filename+'*']
62
65
 
66
+ # Rename pages so that file names containing spaces or parentheses can be processed
67
+ docs.each do |d|
68
+ new_name = d.split("/").last.gsub(" ", "_").gsub("(", "").gsub(")", "")
69
+ File.rename(d, base+'pages/'+new_name)
70
+ end
71
+ filename = filename.gsub(" ", "_").gsub("(", "").gsub(")", "")
72
+ docs_no_spaces = Dir[base+'pages/'+filename+'*']
73
+
63
74
  # Extract text and save
64
- Docsplit.extract_text(docs, :output => 'text')
65
- text_files = Dir['text/'+filename+'*']
75
+ Docsplit.extract_text(docs_no_spaces, :output => base+'text')
76
+ text_files = Dir[base+'text/'+filename+'*']
66
77
  sorted_text = text_files.sort_by {|f| f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i }
67
78
  sorted_text.each do |f|
68
79
  @text += File.read(f)
69
80
  end
70
81
 
71
82
  # Clean up
72
- FileUtils.rm_f Dir.glob("pages/*")
73
- Dir.delete("pages")
74
- FileUtils.rm_f Dir.glob("text/*")
75
- Dir.delete("text")
83
+ FileUtils.rm_f Dir.glob(base+"pages/*")
84
+ Dir.delete(base+"pages")
85
+ FileUtils.rm_f Dir.glob(base+"text/*")
86
+ Dir.delete(base+"text")
76
87
  end
77
88
  end
data/lib/parsefile.rb CHANGED
@@ -2,9 +2,9 @@ require 'json'
2
2
  require 'docsplit'
3
3
  require 'fileutils'
4
4
  require 'pry'
5
- require 'dircrawl'
6
- load 'ocrfile.rb'
7
- load 'extractmetadata.rb'
5
+ #require 'dircrawl'
6
+ load '/home/shidash/Code/ParseFile/lib/ocrfile.rb' #FIX
7
+ load '/home/shidash/Code/ParseFile/lib/extractmetadata.rb' # FIX
8
8
 
9
9
  class ParseFile
10
10
  def initialize(file, input_dir, output_dir)
@@ -15,6 +15,7 @@ class ParseFile
15
15
 
16
16
  # Parse the file
17
17
  def parse_file
18
+ begin
18
19
  # Get metadata
19
20
  m = ExtractMetadata.new(@path, @input_dir, @output_dir)
20
21
  @metadata = m.extract
@@ -25,6 +26,9 @@ class ParseFile
25
26
 
26
27
  # Generate output and return
27
28
  gen_output
29
+ rescue #TODO: Fix!
30
+ binding.pry
31
+ end
28
32
  end
29
33
 
30
34
  # Generate output
@@ -32,7 +36,16 @@ class ParseFile
32
36
  outhash = Hash.new
33
37
  outhash[:full_path] = @path
34
38
  outhash.merge!(@metadata)
35
- outhash[:text] = @text
36
- return JSON.pretty_generate(outhash)
39
+ begin
40
+ outhash[:text] = @text.to_s.encode('UTF-8', {
41
+ :invalid => :replace,
42
+ :undef => :replace,
43
+ :replace => '?'
44
+ })
45
+ return JSON.pretty_generate(outhash)
46
+ rescue
47
+ binding.pry
48
+ end
37
49
  end
38
50
  end
51
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parsefile
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-29 00:00:00.000000000 Z
11
+ date: 2016-05-16 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: OCR file and extract metadata
14
14
  email: shidash@shidash.com