konjac 0.1 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/lib/konjac/version.rb +1 -1
  2. data/lib/konjac/word.rb +89 -7
  3. metadata +15 -15
@@ -1,3 +1,3 @@
1
1
  module Konjac
2
- VERSION = "0.1"
2
+ VERSION = "0.1.1"
3
3
  end
data/lib/konjac/word.rb CHANGED
@@ -46,26 +46,108 @@ module Konjac
46
46
  sub_files = Dir.glob(File.expand_path(file))
47
47
  sub_files.each do |sub_file|
48
48
  # Build a list of all the paths we're working with
49
- dirname = File.dirname(sub_file)
50
- basename = File.basename(sub_file, ".*")
51
- xml_path = "#{dirname}/#{basename}.xml"
52
- tags_path = "#{dirname}/#{basename}.tags"
49
+ dirname = File.dirname(sub_file)
50
+ basename = File.basename(sub_file, ".*")
51
+ xml_path = "#{dirname}/#{basename}_orig.xml"
52
+ clean_path = "#{dirname}/#{basename}.xml"
53
+ tags_path = "#{dirname}/#{basename}.tags"
53
54
 
54
55
  # Unzip the DOCX's word/document.xml file and pipe the output into
55
56
  # an XML with the same base name as the DOCX
56
57
  system "unzip -p #{sub_file} word/document.xml > #{xml_path}"
57
58
 
58
59
  # Read in the XML file and extract the content from each <w:t> tag
59
- reader = Nokogiri::XML(File.read(xml_path))
60
+ cleaner = Nokogiri::XML(File.read(xml_path))
60
61
  File.open(tags_path, "w") do |tags_file|
61
- reader.xpath("//w:t").each do |node|
62
+ # Remove all grammar and spellcheck tags
63
+ cleaner.xpath("//w:proofErr").remove
64
+
65
+ nodes = cleaner.xpath("//w:r")
66
+ prev = nil
67
+ nodes.each do |node|
68
+ unless prev.nil?
69
+ if (prev.next_sibling == node) && compare_nodes(prev, node)
70
+ begin
71
+ node.at_xpath("w:t").content = prev.at_xpath("w:t").content +
72
+ node.at_xpath("w:t").content
73
+ prev.remove
74
+ rescue
75
+ end
76
+ end
77
+ end
78
+
79
+ prev = node
80
+ end
81
+
82
+ cleaner.xpath("//w:t").each do |node|
62
83
  tags_file.puts node.content
63
84
  end
64
85
  end
86
+
87
+ File.open(clean_path, "w") do |xml|
88
+ xml.puts cleaner.to_xml
89
+ end
90
+ end
91
+ end
92
+ end
93
+
94
+ private
95
+
96
# Reports whether two nodes count as equivalent. Each node is converted
# with xml_node_to_hash, the result is passed through clean_hash, and the
# two outcomes are compared with ==.
def compare_nodes(a, b)
  cleaned = [a, b].map { |node| clean_hash(xml_node_to_hash(node)) }
  cleaned.first == cleaned.last
end
103
+
104
# Recursively converts an XML node into plain Ruby data.
#
# * A non-element node becomes its content (run through prepare).
# * An element becomes a Hash: its attributes, if any, are stored under
#   :attributes keyed by attribute name, and each child is stored under its
#   own name. Children sharing a name are collected into an Array.
# * An element whose only child is named "text" collapses to that child's
#   value instead of a Hash.
def xml_node_to_hash(node)
  return prepare(node.content.to_s) unless node.element?

  converted = {}

  attrs = node.attributes
  unless attrs.empty?
    converted[:attributes] = {}
    attrs.each_value do |attr|
      converted[:attributes][attr.name.to_sym] = prepare(attr.value)
    end
  end

  node.children.each do |child|
    value = xml_node_to_hash(child)
    key = child.name.to_sym

    if child.name == "text"
      # A lone text child replaces the whole hash; a text child that has
      # siblings is left out of the result.
      return prepare(value) unless child.next_sibling || child.previous_sibling
      next
    end

    existing = converted[key]
    if !existing
      converted[key] = prepare(value)
    elsif existing.is_a?(Array)
      existing << prepare(value)
    else
      # Second occurrence of this child name: promote the entry to an Array
      converted[key] = [existing] << prepare(value)
    end
  end

  converted
end
141
+
142
# Turns a string that is exactly the decimal form of an integer (e.g. "42",
# "-7") into that Integer. Any other value, including strings such as "007"
# or "4.2" and non-string objects, is returned unchanged.
def prepare(data)
  # is_a? (rather than an exact class comparison) also accepts String
  # subclasses.
  return data unless data.is_a?(String)

  # Round-tripping through to_i only reproduces the input for canonical
  # integer strings, so anything else stays a string.
  data.to_i.to_s == data ? data.to_i : data
end
145
+
146
# Deletes the entries that should not take part in a node comparison: the
# :t entry and the :hint attribute nested under
# :rPr -> :rFonts -> :attributes.
#
# The hash is modified in place and then returned, so callers compare the
# cleaned hashes themselves rather than the value of the last delete.
# Missing or non-Hash levels along the nested path are skipped instead of
# raising NoMethodError, and a non-Hash argument is returned untouched.
def clean_hash(hash)
  return hash unless hash.is_a?(Hash)

  hash.delete :t

  # Walk down to the font attributes, giving up as soon as a level is absent
  # or is not a Hash (e.g. an Array of repeated children).
  font_attributes = [:rPr, :rFonts, :attributes].inject(hash) do |level, key|
    level.is_a?(Hash) ? level[key] : nil
  end
  font_attributes.delete :hint if font_attributes.is_a?(Hash)

  hash
end
68
151
  end
69
152
  end
70
153
  end
71
- # lol
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: konjac
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: 0.1.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-01-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
- requirement: &70261374910800 !ruby/object:Gem::Requirement
16
+ requirement: &70314541350300 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70261374910800
24
+ version_requirements: *70314541350300
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: bundler
27
- requirement: &70261374908420 !ruby/object:Gem::Requirement
27
+ requirement: &70314541347900 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *70261374908420
35
+ version_requirements: *70314541347900
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: rspec
38
- requirement: &70261374907000 !ruby/object:Gem::Requirement
38
+ requirement: &70314541345520 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *70261374907000
46
+ version_requirements: *70314541345520
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: sdoc
49
- requirement: &70261374906120 !ruby/object:Gem::Requirement
49
+ requirement: &70314541340780 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *70261374906120
57
+ version_requirements: *70314541340780
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: autotest
60
- requirement: &70261374905300 !ruby/object:Gem::Requirement
60
+ requirement: &70314541332480 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *70261374905300
68
+ version_requirements: *70314541332480
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: autotest-fsevent
71
- requirement: &70261374904580 !ruby/object:Gem::Requirement
71
+ requirement: &70314541323420 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :development
78
78
  prerelease: false
79
- version_requirements: *70261374904580
79
+ version_requirements: *70314541323420
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: autotest-growl
82
- requirement: &70261374903400 !ruby/object:Gem::Requirement
82
+ requirement: &70314541318360 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,7 +87,7 @@ dependencies:
87
87
  version: '0'
88
88
  type: :development
89
89
  prerelease: false
90
- version_requirements: *70261374903400
90
+ version_requirements: *70314541318360
91
91
  description: A Ruby command-line utility for translating files using a YAML wordlist
92
92
  email:
93
93
  - bryan.mckelvey@gmail.com