parsefile 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 06411fba942946a2f8946a545d75018c3638fe3c
4
- data.tar.gz: 47108b8867072ecffe7d383a991712b95b47d651
3
+ metadata.gz: 8e7561f81277ae170560990c0efd6f1fac6df28d
4
+ data.tar.gz: 0efdf3f4c2c98114be285ac6c7c60d9d728aa0e3
5
5
  SHA512:
6
- metadata.gz: ed1ab41eef012a454b2b6f694b2829c1360627521e1f9964cf3a3d5588e9cf10a2f406627cc0592017b8bf09ff3967e1b92c7cae2b08ea582f7b4321834efce2
7
- data.tar.gz: 469780b671827136b57b43e36870c3584e09f18dcfd332e638f42eec5b7003e8361cdd1ab5cedfec09c122bf91110a294f019713c46987c121a22b397772ac4e
6
+ metadata.gz: 094423025dd474a3aaf7ce4a7c7ee3f1ba7b39946eaee75f25b4e65fe9ea3117fc8fab7a1fd9afd3680a4a722b25eee9bfe6ed92be61be815a1e3f78f39eb6d2
7
+ data.tar.gz: 5f41812876112bfaa81df0fcf2d90842591abe821429af585d8d90702fe0b8822c4885ae6852c979226884ecc5a7fc1832ab072a36f58c8deaf03226ae6205d7
@@ -22,7 +22,10 @@ class ExtractMetadata
22
22
  outhash[:filetype] = get_file_type
23
23
 
24
24
  # Extract file metadata, merge, and return
25
- outhash.merge!(extract_file_metadata)
25
+ begin
26
+ outhash.merge!(extract_file_metadata)
27
+ rescue
28
+ end
26
29
  return outhash
27
30
  end
28
31
 
data/lib/ocrfile.rb CHANGED
@@ -14,8 +14,8 @@ class OCRFile
14
14
  # OCR file
15
15
  def ocr
16
16
  begin
17
- if File.exist?(@output_dir+@rel_path)
18
- load_extracted_text(@output_dir+@rel_path)
17
+ if File.exist?(@output_dir+@rel_path+".json")
18
+ load_extracted_text(@output_dir+@rel_path+".json")
19
19
  elsif @path.include?(".pdf")
20
20
  ocr_pdf
21
21
  else
@@ -24,7 +24,7 @@ class OCRFile
24
24
  rescue # Detect errors
25
25
  binding.pry
26
26
  end
27
-
27
+
28
28
  return @text
29
29
  end
30
30
 
@@ -36,7 +36,7 @@ class OCRFile
36
36
 
37
37
  # Load text that is already extracted
38
38
  def load_extracted_text(file)
39
- @text = File.read(file)
39
+ @text = JSON.parse(File.read(file))["text"]
40
40
  end
41
41
 
42
42
  # Send file to give me text
@@ -55,23 +55,34 @@ class OCRFile
55
55
 
56
56
  # OCR with tesseract
57
57
  def ocr_pdf
58
+ # Dir_paths
59
+ base = Dir.pwd+"/"
60
+
58
61
  # Split pages to handle large PDFs
59
- Docsplit.extract_pages(@path, :output => 'pages')
62
+ Docsplit.extract_pages(@path, :output => base+'pages')
60
63
  filename = @path.split("/").last.gsub(".pdf", "")
61
- docs = Dir['pages/'+filename+'*']
64
+ docs = Dir[base+'pages/'+filename+'*']
62
65
 
66
+ # Rename pages so that they can be processed with spaces
67
+ docs.each do |d|
68
+ new_name = d.split("/").last.gsub(" ", "_").gsub("(", "").gsub(")", "")
69
+ File.rename(d, base+'pages/'+new_name)
70
+ end
71
+ filename = filename.gsub(" ", "_").gsub("(", "").gsub(")", "")
72
+ docs_no_spaces = Dir[base+'pages/'+filename+'*']
73
+
63
74
  # Extract text and save
64
- Docsplit.extract_text(docs, :output => 'text')
65
- text_files = Dir['text/'+filename+'*']
75
+ Docsplit.extract_text(docs_no_spaces, :output => base+'text')
76
+ text_files = Dir[base+'text/'+filename+'*']
66
77
  sorted_text = text_files.sort_by {|f| f.split(filename).last.gsub("_", "").gsub(".txt", "").to_i }
67
78
  sorted_text.each do |f|
68
79
  @text += File.read(f)
69
80
  end
70
81
 
71
82
  # Clean up
72
- FileUtils.rm_f Dir.glob("pages/*")
73
- Dir.delete("pages")
74
- FileUtils.rm_f Dir.glob("text/*")
75
- Dir.delete("text")
83
+ FileUtils.rm_f Dir.glob(base+"pages/*")
84
+ Dir.delete(base+"pages")
85
+ FileUtils.rm_f Dir.glob(base+"text/*")
86
+ Dir.delete(base+"text")
76
87
  end
77
88
  end
data/lib/parsefile.rb CHANGED
@@ -2,9 +2,9 @@ require 'json'
2
2
  require 'docsplit'
3
3
  require 'fileutils'
4
4
  require 'pry'
5
- require 'dircrawl'
6
- load 'ocrfile.rb'
7
- load 'extractmetadata.rb'
5
+ #require 'dircrawl'
6
+ load '/home/shidash/Code/ParseFile/lib/ocrfile.rb' #FIX
7
+ load '/home/shidash/Code/ParseFile/lib/extractmetadata.rb' # FIX
8
8
 
9
9
  class ParseFile
10
10
  def initialize(file, input_dir, output_dir)
@@ -15,6 +15,7 @@ class ParseFile
15
15
 
16
16
  # Parse the file
17
17
  def parse_file
18
+ begin
18
19
  # Get metadata
19
20
  m = ExtractMetadata.new(@path, @input_dir, @output_dir)
20
21
  @metadata = m.extract
@@ -25,6 +26,9 @@ class ParseFile
25
26
 
26
27
  # Generate output and return
27
28
  gen_output
29
+ rescue #TODO: Fix!
30
+ binding.pry
31
+ end
28
32
  end
29
33
 
30
34
  # Generate output
@@ -32,7 +36,16 @@ class ParseFile
32
36
  outhash = Hash.new
33
37
  outhash[:full_path] = @path
34
38
  outhash.merge!(@metadata)
35
- outhash[:text] = @text
36
- return JSON.pretty_generate(outhash)
39
+ begin
40
+ outhash[:text] = @text.to_s.encode('UTF-8', {
41
+ :invalid => :replace,
42
+ :undef => :replace,
43
+ :replace => '?'
44
+ })
45
+ return JSON.pretty_generate(outhash)
46
+ rescue
47
+ binding.pry
48
+ end
37
49
  end
38
50
  end
51
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parsefile
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-29 00:00:00.000000000 Z
11
+ date: 2016-05-16 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: OCR file and extract metadata
14
14
  email: shidash@shidash.com