RubyGems - docx-cloner - Versions diffs - 0.0.1 → 0.1.0 - Mend

docx-cloner 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +8 -8
data/.gitignore +68 -0
data/.rspec +2 -0
data/Gemfile +4 -1
data/docx-examples/read-single-tags-body.xml +1383 -0
data/docx-examples/read-single-tags.docx +0 -0
data/docx-examples/source.docx +0 -0
data/docx-examples/wp.xml +52 -0
data/features/read.feature +46 -0
data/features/replace.feature +64 -0
data/features/steps_define/steps.rb +92 -0
data/lib/docx/cloner.rb +251 -1
data/lib/docx/cloner/version.rb +1 -1
data/spec/cloner_spec.rb +101 -0
data/spec/spec_helper.rb +3 -0
metadata +18 -3

data/docx-examples/read-single-tags.docx ADDED

Binary file

data/docx-examples/source.docx ADDED

Binary file

data/docx-examples/wp.xml ADDED

@@ -0,0 +1,52 @@
+        <w:p w14:paraId="4E9418BB" w14:textId="23E8C963" w:rsidR="00342A6D" w:rsidRDefault="00342A6D" w:rsidP="006A0A53">
+          <w:pPr>
+            <w:pStyle w:val="ab"/>
+            <w:spacing w:after="120"/>
+            <w:rPr>
+              <w:rFonts w:hint="eastAsia"/>
+              <w:lang w:eastAsia="zh-CN"/>
+            </w:rPr>
+          </w:pPr>
+          <w:r>
+            <w:rPr>
+              <w:rFonts w:hint="eastAsia"/>
+              <w:lang w:eastAsia="zh-CN"/>
+            </w:rPr>
+            <w:t>这是一个单词</w:t>
+          </w:r>
+          <w:r>
+            <w:rPr>
+              <w:rFonts w:hint="eastAsia"/>
+              <w:lang w:eastAsia="zh-CN"/>
+            </w:rPr>
+            <w:t xml:space="preserve"> </w:t>
+          </w:r>
+          <w:r w:rsidR="000F595B">
+            <w:rPr>
+              <w:rFonts w:hint="eastAsia"/>
+              <w:lang w:eastAsia="zh-CN"/>
+            </w:rPr>
+            <w:t>{</w:t>
+          </w:r>
+          <w:r>
+            <w:rPr>
+              <w:rFonts w:hint="eastAsia"/>
+              <w:lang w:eastAsia="zh-CN"/>
+            </w:rPr>
+            <w:t>n</w:t>
+          </w:r>
+          <w:r w:rsidR="000F595B">
+            <w:rPr>
+              <w:rFonts w:hint="eastAsia"/>
+              <w:lang w:eastAsia="zh-CN"/>
+            </w:rPr>
+            <w:t>ame}</w:t>
+          </w:r>
+          <w:r>
+            <w:rPr>
+              <w:rFonts w:hint="eastAsia"/>
+              <w:lang w:eastAsia="zh-CN"/>
+            </w:rPr>
+            <w:t>测试</w:t>
+          </w:r>
+        </w:p>

data/features/read.feature ADDED

@@ -0,0 +1,46 @@
+#language: zh-CN
+功能: 读Docx内标签定义
+  这里要确认标签读取的正确性，然后再进入替换阶段
+  1、主要解决的问题包括：将docx文件拆包、找到对应的文件位置
+  2、xml标记可能是散开的，例如"{name}"在docx文件内部表示中，"{"、"name"、"}"是各自独立的xml标记
+  3、替换逻辑，希望使用DSL在程序中指定，因此不应该限定到底使用"{name}"还是"$name$"做标签标识
+  背景: 可读的示例文件列举
+    假如"docx-examples"示例文件夹中存在一个"read-single-tags.docx"的文件
+  场景大纲: 简单地读取词语替换标签
+    这是最简单的情形，例如将标签{name}，替换为真正的姓名。
+    那么程序应该能读到"<tagname>"这个标签词
+    例子: 读取标签的例子
+      "{}"可作为默认的正则表达式设计，在DSL中无需指定
+      程序应该支持中文（以及其它UTF8字符）
+      | tagname |
+      | name    |
+      | {name}  |
+      | {Name}  |
+      | {NAME}  |
+      | {{名字}} |
+      | $名字$   |
+  @wip
+  场景大纲: 读取表格行替换标签
+    这通常是在表格上追加行所使用的
+    那么程序应该能读到"<tagname>"这个标签词
+    例子:
+      | tagname |
+      | {名称1}  |
+      | {名称2}  |
+      | {00.01} |
+      | {00.02} |
+  场景: 读取文档信息标签
+    包括标题、摘要、作者、邮件等设置信息
+  场景: 读取图像标签
+    这是做图像替换时使用的

data/features/replace.feature ADDED

@@ -0,0 +1,64 @@
+#language: zh-CN
+功能: 替换Docx内标签
+  将docx文档中的标签替换为指定的内容。
+  替换的情形有很多，大致包括：
+    1、单个标签替换，如"{name}"替换为"周大福"
+    2、多个标签同时替换
+    3、列表标签替换，如表格中包含一行定义，每行包括"{价格}"和"{数量}"，而要替换的数据是不确定的，如有5行，也可能是50行
+       但所替换的数据都使用标签所在的行样式
+    4、表格中可能包含一些复杂的情况，例如行样式包括按奇数行、偶数行的不同样式
+    5、docx文件也可能对List列表作为整行的样式复制
+    6、更复杂的情况是图表、图片等情况
+    7、还有页眉、页脚中的内容替换
+  背景: 被替换的源文件
+    假如"docx-examples"示例文件夹中存在一个"source.docx"的文件
+    而且"docx-examples/dest.docx"这个目标文件已经被清除
+  场景大纲: 1、简单地读取词语替换标签
+    这是最简单的情形，例如将标签{name}，替换为真正的姓名。
+    假如程序将目标文件中的"<tagname>"替换为"<value>"
+    那么应该生成目标文件
+    而且被目标文件中应该包含"<value>"这个标签词
+    例子: 替换单个标签的几种情况
+      | tagname | value |
+      | {name}  | 周大福 |
+      | {Name}  | 周大福 |
+      | {NAME}  | 周大福 |
+      | {{名字}} | 周大福 |
+      | $名字$   | 周大福 |
+  场景: 2、设置多个标签的情形
+    如果同时替换5个标签的，也要能正确运行
+    假如有这样一组数据：
+      | {name}  | 周大福 |
+      | {Name}  | 周二福 |
+      | {NAME}  | 周三福 |
+      | {{名字}} | 周四福 |
+      | $名字$   | 周五福 |
+    当程序将源文件的第1列中标签替换为第2列数据
+    那么应该生成目标文件
+    而且被目标文件中应该包含被替换的第2列数据
+  @wip
+  场景: 3、替换表格行数据
+    按行数据替换表格内容是常见的应用
+    假如有这样一组数据：
+      | {名称1} | {00.01}   |
+      | 自行车  | 256.00    |
+      | 小汽车  | 125600    |
+      | 大卡车  | 256000.00 |
+      | 电视机  | 6999.00   |
+      | 洗衣机  | 3488.00   |
+    当程序将表中第1行作为标签名，第2行以后作为行数据替换
+    那么应该生成目标文件
+    而且被目标文件中应该包含被替换的第2行以后的数据

data/features/steps_define/steps.rb ADDED

@@ -0,0 +1,92 @@
+#encoding: utf-8
+lib = File.expand_path('../../../lib', __FILE__)
+require "#{lib}/docx/cloner"
+#require 'fileutils'
+假如(/^"(.*?)"示例文件夹中存在一个"(.*?)"的文件$/) do |folder, file|
+  @source_filename = File.expand_path "#{folder}/#{file}"
+  File.exists?(@source_filename).should be_true
+end
+那么(/^程序应该能读到"(.*?)"这个标签词$/) do |tag_name|
+  docx = Docx::Cloner::DocxTool.new @source_filename
+  result = docx.include_single_tag? tag_name
+  docx.release
+  result.should be_true
+end
+假如(/^"(.*?)"这个目标文件已经被清除$/) do |dest|
+  @dest_filename = dest
+  File.delete @dest_filename if File.exist?(dest)
+  File.exist?(dest).should be_false
+end
+假如(/^程序将目标文件中的"(.*?)"替换为"(.*?)"$/) do |tag, value|
+  docx = Docx::Cloner::DocxTool.new @source_filename
+  result = docx.set_single_tag tag, value
+  docx.save @dest_filename
+  docx.release
+  result.should be_true
+end
+那么(/^应该生成目标文件$/) do
+  File.exist?(@dest_filename).should be_true
+end
+而且(/^被目标文件中应该包含"(.*?)"这个标签词$/) do |value|
+  docx = Docx::Cloner::DocxTool.new @dest_filename
+  result = docx.include_single_tag? value
+  docx.release
+  result.should be_true
+end
+假如(/^有这样一组数据：$/) do |table|
+  @data = table.raw
+end
+当(/^程序将源文件的第1列中标签替换为第2列数据$/) do
+  result = true
+  docx = Docx::Cloner::DocxTool.new @source_filename
+  @data.each do |row|
+    result &= docx.set_single_tag row[0], row[1]
+  end
+  docx.save @dest_filename
+  docx.release
+  result.should be_true
+end
+那么(/^被目标文件中应该包含被替换的第2列数据$/) do
+  result = true
+  docx = Docx::Cloner::DocxTool.new @dest_filename
+  @data.each do |row|
+    result &= docx.include_single_tag? row[1]
+  end
+  docx.release
+  result.should be_true
+end
+当(/^程序将表中第1行作为标签名，第2行以后作为行数据替换$/) do
+  docx = Docx::Cloner::DocxTool.new @source_filename
+  #先设置行标签的复制范围和类型
+  #再逐行克隆表数据
+  #yield块结束后清除标签
+  result = docx.set_row_tags @data.first, @data[1..-1], 'tr'
+  docx.save @dest_filename
+  docx.release
+  result.should be_true
+end
+那么(/^被目标文件中应该包含被替换的第2行以后的数据$/) do
+  result = true
+  docx = Docx::Cloner::DocxTool.new @dest_filename
+  @data[1..-1].each do |row|
+    row.each do |value|
+      result &= docx.include_single_tag? value
+    end
+  end
+  docx.release
+  result.should be_true
+end

data/lib/docx/cloner.rb CHANGED

@@ -1,7 +1,257 @@
+#encoding: utf-8
 require "docx/cloner/version"
+require 'zip/zip'  #rubyzip gem
+require 'nokogiri'
 module Docx
   module Cloner
-    # Your code goes here...
+    class WordXmlFile
+      def self.open(path, &block)
+        self.new(path, &block)
+      end
+      def initialize(path, &block)
+        @replace = {}
+        if block_given?
+          @zip = Zip::ZipFile.open(path)
+          yield self
+          @zip.close
+        else
+          @zip = Zip::ZipFile.open(path)
+        end
+      end
+      def merge(rec)
+        _xml = @zip.read("word/document.xml")
+        doc = Nokogiri::XML(_xml)
+        tags = doc.root.xpath("//w:t[contains(., '_Name')]")
+        tags.each do |field|
+          new_field = field
+          if field.content == 'First_Name'
+            field.inner_html = 'Adi'
+            new_field.inner_html = 'My Adi'
+            field.add_next_sibling(new_field.to_html)
+          elsif field.content == 'Last_Name'
+            field.inner_html = 'Zhou'
+          end
+        end
+        @replace["word/document.xml"] = doc.serialize :save_with => 0
+      end
+      def save(path)
+        Zip::ZipFile.open(path, Zip::ZipFile::CREATE) do |out|
+          @zip.each do |entry|
+            out.get_output_stream(entry.name) do |o|
+              if @replace[entry.name]
+                o.write(@replace[entry.name])
+              else
+                o.write(@zip.read(entry.name))
+              end
+            end
+          end
+        end
+        @zip.close
+      end
+    end
+    class DocxTool
+      '加载docx文件，将段落存储到@paragraph，用@paragraph[:text_content]检索，再从段落内检索xml标签位置'
+      def initialize(file)
+        @zip = Zip::ZipFile.open(file)
+        _xml = @zip.read("word/document.xml")
+        @doc = Nokogiri::XML(_xml)
+        @global_paragraph = generate_paragraph @doc
+        @replace = {}
+        #puts @paragraph
+      end
+      def release
+        @zip.close
+      end
+      def save(path)
+        @replace["word/document.xml"] = @doc.serialize :save_with => 0
+        Zip::ZipFile.open(path, Zip::ZipFile::CREATE) do |out|
+          @zip.each do |entry|
+            out.get_output_stream(entry.name) do |o|
+              if @replace[entry.name]
+                o.write(@replace[entry.name])
+              else
+                o.write(@zip.read(entry.name))
+              end
+            end
+          end
+        end
+      end
+      def include_single_tag?(tag)
+        @global_paragraph.each do |p|
+          if p[:text_content].include? tag
+            return true
+          end
+        end
+        return false
+      end
+      def read_single_tag_xml(tag)
+        @global_paragraph.each do |p|
+          if p[:text_content].include? tag
+            from = p[:text_content].index tag
+            to = from + tag.size - 1
+            #puts "from:#{from}, to:#{to}"
+            pos = 0
+            dest = ""
+            p[:text_run].each do |wt|
+              #puts "pos:#{pos}"
+              if pos >= from && pos < to
+                dest << wt.parent.to_xml << "\n"
+              end
+              if pos >= to
+                return dest
+              end
+              pos += wt.content.size
+            end
+            return dest
+          end
+        end
+        return ''
+      end
+      #替换单个标签为指定值
+      def set_single_tag tag, value
+        replace_tag tag, value
+      end
+      #获取标签所在的范围，例如表格的行
+      #简单的考虑，则tags中第一个标签位置即可确定为scope位置
+      #复杂的考虑，则可根据tags中所有标签的共同根（如<w:tr>）确定scope位置，这种情况将允许标签名拥有自己的作用域
+      #这里仅做简单的考虑
+      def get_tag_scope tag, type
+        @global_paragraph.each do |p|
+          if p[:text_content].include? tag #这里是简单的考虑，即使行内标签也必须全局唯一
+            node = p[:text_run].first
+            while true
+              return unless node                      #查找父节点失败
+              return node if node.node_name == type   #查找到匹配的父节点
+              node = node.parent
+            end
+          end
+        end
+        return false
+      end
+      def generate_paragraph node
+        paragraphs = []
+        puts "查找范围：#{node.path}"
+        wp_set = node.xpath(".//w:p")
+        #puts "#{wp_set.size}'s wp"
+        wp_set.each do |wp|
+          p = {text_content: '', text_run: []}
+          wp.xpath(".//w:t").each do |t|
+            p[:text_content] << t.content
+            p[:text_run] << t
+            #puts "node name: #{t.node_name}" if t.content.size > 0
+            #puts t.path
+          end
+          paragraphs << p
+          #puts p[:text_content].include? '$名字$'
+        end
+        return paragraphs
+      end
+      #在指定的范围内替换标签
+      def replace_tag tag, value, node=nil
+        paragraphs = node ? generate_paragraph(node) : @global_paragraph
+        #puts paragraphs
+        paragraphs.each do |p|
+          #puts p[:text_content]
+          if p[:text_content].include? tag
+            from = p[:text_content].index tag
+            to = from + tag.size - 1
+            #puts "tag:#{tag} | from:#{from}, to:#{to} >> #{p[:text_content]}"
+            pos = 0
+            dest = []
+            #puts p[:text_run]
+            p[:text_run].each do |wt|
+              #puts "pos:#{pos}"
+              #通常情况下，msword会把标签拆分成多个xml标签，如'{name}'被拆分成'<wt>{</wt>'和'<wt>name}</wt>'
+              #这可能跟编辑器有关，在处理中文时，这是一种常见的情形
+              if pos+1 >= from && pos <= to #通过pos+1修正临界点问题
+                dest << wt
+              end
+              if pos > to
+                break
+              end
+              pos += wt.content.size
+              #这里要处理一下标签没有被拆分的情形，而是作为纯文本被包含在某个标签中
+              #例如'{name}'包含在'<wt>my {name}</wt>'中
+              #puts "pos:#{pos}, to:#{to}, dest.size:#{dest.size}"
+              #puts wt
+              if pos >= to && dest.size == 0
+                #puts "simple_type | pos:#{pos}, to:#{to} >> #{wt.content}"
+                wt.inner_html = wt.content.sub(tag, value)
+                return true #如果是这种简单情形，就不再需要后续处理了
+              end
+            end
+            if dest.size > 0
+              puts "被替换节点：#{dest.first.path}"
+              dest.first.content = value
+              dest[1..-1].each do |node|
+                #puts node
+                node.remove
+              end
+              #puts "\n"
+              return true
+            else
+              return false
+            end
+          end
+        end
+        return false
+      end
+      #clone标签所在的范围，例如表格的行
+      #返回一组新的行对象集合
+      def clone_tag_scope node, times
+        #puts "clone #{node.node_name} #{times} times"
+        nodes = Array.new times
+        puts "被克隆节点：#{node.path}"
+        times.downto(1).each do |_i|
+          i = _i.to_i - 1
+          nodes[i] = node.dup
+          node.add_next_sibling nodes[i]
+          puts "第#{i+1}个节点克隆：#{nodes[i].path}"
+        end
+        return nodes
+      end
+      #根据行标签设置，替换成多行数据，这里考虑表格的一般情况
+      def set_row_tags tags, values, type
+        puts "tags:#{tags}, values:#{values}, type:#{type}"
+        #找到标签所在行的父节点
+        tag_scope_node = get_tag_scope tags.first, type
+        value_scope_nodes = clone_tag_scope tag_scope_node, values.size
+        value_scope_nodes.each_with_index do |node, r|
+          puts "查找范围：#{node.path}"
+          tags.each_with_index do |tag, c|
+            replace_tag tag, values[r][c], node
+          end
+        end
+        #清除标签
+        tag_scope_node.remove
+        return true
+      end
+    end
   end
 end