wp2txt 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c05af7e5c72b073f18b53eca8619212f0928aaa1
4
- data.tar.gz: 02dc116458041b096fd811b3fec6ffb6e6ff3ee7
3
+ metadata.gz: bcfd6986e262e455c100d664583b099d57ff4428
4
+ data.tar.gz: 68c922b43951b7f326b0136681981a166208d0a9
5
5
  SHA512:
6
- metadata.gz: 26479a43e8e3ccebfb6578562d62b7b6d2234e924838774a11d143c8fad7e1952db0278526d202a77aad680926b96baa25c8da9e0c23f916d95ccf614d5e82a3
7
- data.tar.gz: 8959c9b51efb18386cf556c67439332fc43723d8de8abfc014745d0d0ad1f89ba56b6222b6f8d2a33748343e2dbef93752ee104c8a54c57e01fe21bee7e2f892
6
+ metadata.gz: 294e0f8e1d2b37534ad885c617cfbd72ad72144dca6fb01231f6e2cf691a86bf58690f5dd1b2b410f8ee23eb3c74fa3f40e4ca8bbf3f3921ea78295783da5f2e
7
+ data.tar.gz: 71a1b8feca5c3067ff534f0239c4c937485ff3ed8c0a6de793be0b71befd440cb5b1c2c465dc6684c558a53209ab6481dd2e60edfe7fb7cbdd9c6f07416efd24
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ $: << File.join(File.dirname(__FILE__))
5
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
6
+
7
+ require 'wp2txt'
8
+ require 'wp2txt/utils'
9
+ include Wp2txt
10
+ require 'benchmark'
11
+
12
+ data_dir = File.join(File.dirname(__FILE__), '..', "data")
13
+
14
+ parent = Wp2txt::CmdProgbar.new
15
+ input_file = File.join(data_dir, "testdata.bz2")
16
+ output_dir = data_dir
17
+ tfile_size = 10
18
+ convert = true
19
+ strip_tmarker = true
20
+
21
+
22
+
23
+ Benchmark.bm do |x|
24
+ x.report do
25
+ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
26
+ wpconv.extract_text do |article|
27
+ title = format_wiki article.title
28
+ title = "[[#{title}]]\n"
29
+
30
+ contents = "\nCATEGORIES: "
31
+ contents += article.categories.join(", ")
32
+ contents += "\n\n"
33
+
34
+ article.elements.each do |e|
35
+ case e.first
36
+ when :mw_heading
37
+ line = format_wiki(e.last)
38
+ when :mw_paragraph
39
+ line = format_wiki(e.last)
40
+ when :mw_table, :mw_htable
41
+ line = format_wiki(e.last)
42
+ when :mw_pre
43
+ line = e.last
44
+ when :mw_quote
45
+ line = format_wiki(e.last)
46
+ when :mw_unordered, :mw_ordered, :mw_definition
47
+ line = format_wiki(e.last)
48
+ when :mw_redirect
49
+ line = format_wiki(e.last)
50
+ line += "\n\n"
51
+ else
52
+ next
53
+ end
54
+ contents += line
55
+ contents = remove_templates(contents)
56
+ end
57
+
58
+ ##### cleanup #####
59
+ if /\A\s*\z/m =~ contents
60
+ result = ""
61
+ else
62
+ result = title + "\n" + contents
63
+ end
64
+ result = result.gsub(/\[ref\]\s*\[\/ref\]/m){""}
65
+ result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
66
+ end
67
+ end
68
+ end
69
+
data/bin/wp2txt CHANGED
@@ -45,7 +45,7 @@ output_dir = opts[:output_dir]
45
45
  tfile_size = opts[:file_size]
46
46
  convert = opts[:convert]
47
47
  strip_tmarker = opts[:marker] ? false : true
48
- opt_array = [:title_off, :list, :heading, :table, :template, :redirect]
48
+ opt_array = [:title, :list, :heading, :table, :template, :redirect]
49
49
  config = {}
50
50
  opt_array.each do |opt|
51
51
  config[opt] = opts[opt]
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.6.0"
2
+ VERSION = "0.6.1"
3
3
  end
@@ -1,4 +1,4 @@
1
- # -*- coding: utf-8 -*-
1
+ # -*- coding: utf-8 -*-
2
2
  $:.push File.expand_path("../lib", __FILE__)
3
3
  require "wp2txt/version"
4
4
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-04 00:00:00.000000000 Z
11
+ date: 2014-10-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -99,6 +99,7 @@ description: WP2TXT extracts plain text data from Wikipedia dump file (encoded i
99
99
  email:
100
100
  - yohasebe@gmail.com
101
101
  executables:
102
+ - benchmark.rb
102
103
  - wp2txt
103
104
  extensions: []
104
105
  extra_rdoc_files: []
@@ -108,6 +109,7 @@ files:
108
109
  - LICENSE
109
110
  - README.md
110
111
  - Rakefile
112
+ - bin/benchmark.rb
111
113
  - bin/wp2txt
112
114
  - data/testdata.bz2
113
115
  - lib/wp2txt.rb
@@ -138,7 +140,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
138
140
  version: '0'
139
141
  requirements: []
140
142
  rubyforge_project: wp2txt
141
- rubygems_version: 2.4.1
143
+ rubygems_version: 2.4.2
142
144
  signing_key:
143
145
  specification_version: 4
144
146
  summary: Wikipedia dump to text converter