tag_remover_nokogiri 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0ee5b1c651af34648ff509cd22bc7d92bd19d45d
4
+ data.tar.gz: 358514f7c1c35ed2e493b5dc87e53d24cfe4cf66
5
+ SHA512:
6
+ metadata.gz: a851331a3a07215fd67e0f2eacc42764f59fcfa6310881835d01411fd633c12ecf08261dad528d905c65b89a4aa13aeb5972ca2d6624c5d9052a8fcdecbb72fb
7
+ data.tar.gz: 51b2801326b831db6c44d3a43b3d45c96b3667d36af8ccaf3a381dfed7bb56aca050a1e279a12ce11ef9fd1de377b6b63420f3397288a12113c0b00831c8838d
data/.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ *.gem
15
+ mkmf.log
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in tag_remover.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Daniel Smith
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,49 @@
1
+ # TagRemover (Nokogiri edition)
2
+
3
+ Tag remover lets you remove all elements with the specified tags from extremely large XML documents without parsing or loading the whole thing into memory. This is useful for processing unreasonably large documents without making your server fall over.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'tag_remover_nokogiri', require: 'tag_remover'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install tag_remover_nokogiri
20
+
21
+ ## Usage
22
+
23
+ The following line will read XML from `input_stream`, and write it out to `output_stream` with all `div` and `img` elements removed.
24
+
25
+ ```ruby
26
+ TagRemover.process input_stream, output_stream, remove_tags: ['div', 'img']
27
+ ```
28
+
29
+ Options include:
30
+
31
+ * `remove_tags`: List of tags to remove from the XML file.
32
+ * `close_streams`: (`true`|`false`) If set, TagRemover will close `input_stream` and `output_stream` once the process is over.
33
+ * [NOT IMPLEMENTED] `format`: (`true`|`false`) If set, then the contents of `output_stream` will be formatted.
34
+
35
+ TagRemover can be used from the command line with the `rmtags` command. The following is an example that reads input.xml and writes the output to output.xml, removing all `div` and `img` elements:
36
+
37
+ $ rmtags input.xml output.xml div img
38
+
39
+ ## Limitations
40
+
41
+ Tag remover currently only works correctly if the XML is formatted with only one tag per line.
42
+
43
+ ## Contributing
44
+
45
+ 1. Fork it ( https://github.com/jellymann/tag_remover_nokogiri/fork )
46
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
47
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
48
+ 4. Push to the branch (`git push origin my-new-feature`)
49
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
data/bin/rmtags ADDED
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'tag_remover'
4
+
5
# Command-line entry point.
#
# Usage: rmtags INPUT_FILE OUTPUT_FILE TAG [TAG, TAG, ...]
#
# Reads XML from INPUT_FILE and writes it to OUTPUT_FILE with every element
# named by one of the TAG arguments removed.
if ARGV.length < 3
  # Usage errors belong on stderr so they never end up in piped output.
  warn "Usage: rmtags INPUT_FILE OUTPUT_FILE TAG [TAG, TAG, ...]"
  exit 1
end

# The guard above guarantees at least three arguments, so no fallback
# filenames are needed here.
input_filename, output_filename, *tags_to_remove = ARGV

# Block-form File.open closes both handles even if processing raises, so
# TagRemover does not need to be asked to close the streams itself.
File.open(input_filename, 'r') do |input_file|
  File.open(output_filename, 'w') do |output_file|
    TagRemover.process input_file, output_file, remove_tags: tags_to_remove
  end
end
20
+
@@ -0,0 +1,128 @@
1
+ require "tag_remover/version"
2
+ require "nokogiri"
3
+
4
module TagRemover

  # Streams an XML document from +input+ to +output+ node by node, dropping
  # every element (together with its whole subtree) whose name is listed in
  # opts[:remove_tags].
  class Worker
    # Characters that have to be re-escaped when text or attribute values are
    # written back out; the reader hands them over with the predefined XML
    # entities already decoded.
    XML_ESCAPES = {
      '&' => '&amp;',
      '<' => '&lt;',
      '>' => '&gt;',
      '"' => '&quot;'
    }.freeze

    # input  - anything Nokogiri::XML::Reader accepts (String or IO).
    # output - an object responding to #write (and #close when
    #          opts[:close_streams] is set).
    # opts   - Hash of options:
    #          :remove_tags   - Array of element names to remove.
    #          :close_streams - when truthy, close input and output afterwards.
    def initialize(input, output, opts)
      @input = input
      @output = output
      @opts = opts
    end

    # Runs the removal. Returns nil.
    #
    # When opts[:close_streams] is set the streams are closed even if reading
    # or writing raises, so a failed run does not leak file handles.
    def perform
      @tags_to_remove = @opts[:remove_tags] || []

      @in_tag = nil # index into @tags_to_remove of the element being skipped
      @depth = 0    # nesting depth of same-named elements inside that element

      Nokogiri::XML::Reader(@input).each { |node| process_node node }
      nil
    ensure
      close_streams if @opts[:close_streams]
    end

    private

    # Closes both streams; the output is closed even if closing the input
    # raises.
    def close_streams
      @input.close
    ensure
      @output.close
    end

    # Handles one reader node: either swallows it (it is, or is inside, a
    # removed element) or writes its serialized form to the output.
    def process_node(node)
      if @in_tag
        skip_inside_removed node
        return
      end

      index = @tags_to_remove.index do |tag_str|
        opening_tag?(node, tag_str) || single_tag?(node, tag_str)
      end

      if index.nil?
        @output.write stringify(node)
      elsif !node.self_closing?
        # An opening tag starts a skipped subtree; a self-closing one is
        # simply dropped with nothing further to track.
        @in_tag = index
        @depth = 1
      end
    end

    # Tracks nesting while inside a removed element so that the subtree ends
    # at the matching closing tag, not at the first one encountered.
    def skip_inside_removed(node)
      in_tag_str = @tags_to_remove[@in_tag]

      if opening_tag? node, in_tag_str
        @depth += 1
      elsif closing_tag? node, in_tag_str
        @depth -= 1
        @in_tag = nil if @depth.zero?
      end
    end

    # True for a non-self-closing start tag, optionally restricted to +match+.
    def opening_tag?(tag, match = nil)
      tag.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT &&
        !tag.self_closing? && match?(tag, match)
    end

    # True for an end tag, optionally restricted to +match+.
    def closing_tag?(tag, match = nil)
      tag.node_type == Nokogiri::XML::Reader::TYPE_END_ELEMENT && match?(tag, match)
    end

    # True for a self-closing element, optionally restricted to +match+.
    def single_tag?(tag, match = nil)
      tag.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT &&
        tag.self_closing? && match?(tag, match)
    end

    # A nil +match+ accepts any name.
    def match?(tag, match = nil)
      match.nil? || tag.name == match
    end

    # Serializes one node as a line of output. Value-carrying nodes are
    # stripped of surrounding whitespace, skipped when blank and re-escaped
    # so the result stays well-formed.
    def stringify(tag)
      if opening_tag? tag
        "<#{with_attrs tag}>\n"
      elsif closing_tag? tag
        "</#{tag.name}>\n"
      elsif single_tag? tag
        "<#{with_attrs tag}/>\n"
      elsif tag.value?
        text = tag.value.strip
        text.empty? ? "" : "#{escape text}\n"
      else
        "#{tag}\n"
      end
    end

    # Element name followed by its attributes. The trailing space after the
    # last attribute is part of the established output format.
    def with_attrs(tag)
      return tag.name unless tag.attributes?
      attrs = tag.attributes.map { |k, v| "#{k}=\"#{escape v}\"" }.join(' ')
      "#{tag.name} #{attrs} "
    end

    # Escapes the characters that are not allowed verbatim in XML text or
    # double-quoted attribute values.
    def escape(text)
      text.to_s.gsub(/[&<>"]/, XML_ESCAPES)
    end
  end

  # Reads XML from +input+ and writes it to +output+ with all elements named
  # in opts[:remove_tags] removed. See Worker#initialize for the options.
  def self.process(input, output, opts = {})
    Worker.new(input, output, opts).perform
  end

end
@@ -0,0 +1,3 @@
1
module TagRemover
  # Gem version; frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.2".freeze
end
@@ -0,0 +1,3 @@
1
+ require 'pry'
2
+ require 'tag_remover'
3
+
@@ -0,0 +1,141 @@
1
+ require 'spec_helper'
2
+
3
+ describe TagRemover do
4
+ describe ".process" do
5
+ it "removes elements" do
6
+ input = """
7
+ <root>
8
+ <remove>
9
+ Some contents
10
+ </remove>
11
+ <remove >
12
+ </remove >
13
+ </root>"""
14
+
15
+ output = StringIO.new
16
+ tags_to_remove = ['remove']
17
+
18
+ TagRemover.process input, output, remove_tags: tags_to_remove
19
+
20
+ expect(output.string).to eq """<root>
21
+ </root>
22
+ """
23
+ end
24
+
25
+ it "removes single tags" do
26
+ input = StringIO.new """<root>
27
+ <remove/>
28
+ <remove />
29
+ </root>
30
+ """
31
+ output = StringIO.new
32
+ tags_to_remove = ['remove']
33
+
34
+ TagRemover.process input, output, remove_tags: tags_to_remove
35
+
36
+ expect(output.string).to eq """<root>
37
+ </root>
38
+ """
39
+ end
40
+
41
+ it "removes tags with attributes" do
42
+ input = StringIO.new '''<root>
43
+ <keep foo="bar" qux="baz" >
44
+ </keep>
45
+ <keep fizz="buzz" />
46
+ <remove x="y" a="b" >
47
+ </remove>
48
+ <remove x="y" a="b" />
49
+ </root>
50
+ '''
51
+ output = StringIO.new
52
+ tags_to_remove = ['remove']
53
+
54
+ TagRemover.process input, output, remove_tags: tags_to_remove
55
+
56
+ expect(output.string).to eq '''<root>
57
+ <keep foo="bar" qux="baz" >
58
+ </keep>
59
+ <keep fizz="buzz" />
60
+ </root>
61
+ '''
62
+ end
63
+
64
+ it "deals with multiple tags on one line" do
65
+ input = StringIO.new '''<root><remove>
66
+ </remove>
67
+ <remove/><keep>
68
+ </keep>
69
+ <remove>Bad Stuff!
70
+ </remove>Happy Stuff :)<keep/>
71
+ </root>
72
+ '''
73
+ output = StringIO.new
74
+ tags_to_remove = ['remove']
75
+
76
+ TagRemover.process input, output, remove_tags: tags_to_remove
77
+
78
+ expect(output.string).to eq """<root>
79
+ <keep>
80
+ </keep>
81
+ Happy Stuff :)
82
+ <keep/>
83
+ </root>
84
+ """
85
+ end
86
+
87
+ it "keeps elements" do
88
+ input = StringIO.new """<root>
89
+ <keep>
90
+ </keep>
91
+ </root>
92
+ """
93
+ tags_to_remove = ['remove']
94
+
95
+ output = StringIO.new
96
+
97
+ TagRemover.process input, output, remove_tags: tags_to_remove
98
+
99
+ expect(output.string).to eq input.string
100
+ end
101
+
102
+ it "is ok with doing nothing" do
103
+ input = StringIO.new """<root>
104
+ </root>
105
+ """
106
+ output = StringIO.new
107
+
108
+ TagRemover.process input, output
109
+
110
+ expect(output.string).to eq input.string
111
+ end
112
+
113
+ it "removes nested tags" do
114
+ input = StringIO.new """<root>
115
+ <remove>
116
+ <remove>
117
+ </remove>
118
+ </remove>
119
+ </root>
120
+ """
121
+ tags_to_remove = ['remove']
122
+ output = StringIO.new
123
+
124
+ TagRemover.process input, output, remove_tags: tags_to_remove
125
+
126
+ expect(output.string).to eq """<root>
127
+ </root>
128
+ """
129
+ end
130
+
131
+ it "closes the streams" do
132
+ input = StringIO.new "<root></root>"
133
+ output = StringIO.new
134
+
135
+ TagRemover.process input, output, close_streams: true
136
+
137
+ expect(input).to be_closed
138
+ expect(output).to be_closed
139
+ end
140
+ end
141
+ end
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'tag_remover/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "tag_remover_nokogiri"
8
+ spec.version = TagRemover::VERSION
9
+ spec.authors = ["Daniel Smith"]
10
+ spec.email = ["jellymann@gmail.com"]
11
+ spec.summary = %q{Remove elements from large XML documents (using nokogiri).}
12
+ spec.description = %q{Tag remover let's you remove all elements of specified tags from extremely large XML documents without parsing or loading the whole thing in memory, useful for processing unreasonably large documents without making your server fall over.}
13
+ spec.homepage = "https://github.com/jellymann/tag_remover_nokogiri"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.required_ruby_version = ">= 1.9.2"
22
+
23
+ spec.add_runtime_dependency "nokogiri", "~> 1.5", '>= 1.5.5'
24
+
25
+ spec.add_development_dependency "bundler", "~> 1.6"
26
+ spec.add_development_dependency "rake", "~> 10.0"
27
+ spec.add_development_dependency "rspec", "~> 3.1"
28
+ spec.add_development_dependency "pry", "~> 0.10"
29
+ end
metadata ADDED
@@ -0,0 +1,138 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tag_remover_nokogiri
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Daniel Smith
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-02-19 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.5'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.5.5
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.5'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.5.5
33
+ - !ruby/object:Gem::Dependency
34
+ name: bundler
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '1.6'
40
+ type: :development
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '1.6'
47
+ - !ruby/object:Gem::Dependency
48
+ name: rake
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '10.0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '10.0'
61
+ - !ruby/object:Gem::Dependency
62
+ name: rspec
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '3.1'
68
+ type: :development
69
+ prerelease: false
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '3.1'
75
+ - !ruby/object:Gem::Dependency
76
+ name: pry
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '0.10'
82
+ type: :development
83
+ prerelease: false
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '0.10'
89
+ description: Tag remover let's you remove all elements of specified tags from extremely
90
+ large XML documents without parsing or loading the whole thing in memory, useful
91
+ for processing unreasonably large documents without making your server fall over.
92
+ email:
93
+ - jellymann@gmail.com
94
+ executables:
95
+ - rmtags
96
+ extensions: []
97
+ extra_rdoc_files: []
98
+ files:
99
+ - ".gitignore"
100
+ - ".rspec"
101
+ - Gemfile
102
+ - LICENSE.txt
103
+ - README.md
104
+ - Rakefile
105
+ - bin/rmtags
106
+ - lib/tag_remover.rb
107
+ - lib/tag_remover/version.rb
108
+ - spec/spec_helper.rb
109
+ - spec/tag_remover_spec.rb
110
+ - tag_remover_nokogiri.gemspec
111
+ homepage: https://github.com/jellymann/tag_remover_nokogiri
112
+ licenses:
113
+ - MIT
114
+ metadata: {}
115
+ post_install_message:
116
+ rdoc_options: []
117
+ require_paths:
118
+ - lib
119
+ required_ruby_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: 1.9.2
124
+ required_rubygems_version: !ruby/object:Gem::Requirement
125
+ requirements:
126
+ - - ">="
127
+ - !ruby/object:Gem::Version
128
+ version: '0'
129
+ requirements: []
130
+ rubyforge_project:
131
+ rubygems_version: 2.4.5
132
+ signing_key:
133
+ specification_version: 4
134
+ summary: Remove elements from large XML documents (using nokogiri).
135
+ test_files:
136
+ - spec/spec_helper.rb
137
+ - spec/tag_remover_spec.rb
138
+ has_rdoc: